{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,15]],"date-time":"2026-07-15T18:53:31Z","timestamp":1784141611860,"version":"3.55.0"},"reference-count":80,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Science and Technology Council","award":["113-2628-E-002-033"],"award-info":[{"award-number":["113-2628-E-002-033"]}]},{"name":"National Science and Technology Council","award":["110-2223-E-002-007-MY3"],"award-info":[{"award-number":["110-2223-E-002-007-MY3"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3436618","type":"journal-article","created":{"date-parts":[[2024,8,2]],"date-time":"2024-08-02T17:37:15Z","timestamp":1722620235000},"page":"3730-3744","source":"Crossref","is-referenced-by-count":7,"title":["SpeechPrompt: Prompting Speech Language Models for Speech Processing Tasks"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1562-7282","authenticated-orcid":false,"given":"Kai-Wei","family":"Chang","sequence":"first","affiliation":[{"name":"Graduate Institute of Communication Engineering, National Taiwan University, Taipei City, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7166-5534","authenticated-orcid":false,"given":"Haibin","family":"Wu","sequence":"additional","affiliation":[{"name":"Graduate Institute of Communication Engineering, National Taiwan University, Taipei City, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5054-8510","authenticated-orcid":false,"given":"Yu-Kai","family":"Wang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3044-8709","authenticated-orcid":false,"given":"Yuan-Kuei","family":"Wu","sequence":"additional","affiliation":[{"name":"Graduate Institute of Communication Engineering, National Taiwan University, Taipei City, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hua","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wei-Cheng","family":"Tseng","sequence":"additional","affiliation":[{"name":"University of Texas at Austin, Austin, TX, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Iu-Thing","family":"Kang","sequence":"additional","affiliation":[{"name":"MediaTek, Hsinchu, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0656-9874","authenticated-orcid":false,"given":"Shang-Wen","family":"Li","sequence":"additional","affiliation":[{"name":"FAIR, Meta, Menlo Park, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9654-5747","authenticated-orcid":false,"given":"Hung-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"Graduate Institute of Communication Engineering, National Taiwan University, Taipei City, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-556"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"ref7","first-page":"140:1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.185"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.20"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.158"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00626-4"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acllong.353"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2023.08.012"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10610"},{"key":"ref17","first-page":"20841","article-title":"Black-box tuning for language-model-as-a-service","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sun","year":"2022"},{"key":"ref18","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.593"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11032"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2051"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.06.001"},{"key":"ref24","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3206084"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3200909"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-demo.1"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref30","first-page":"3915","article-title":"Self-supervised learning with random-projection quantizer for speech recognition","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chiu","year":"2022"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref32","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Baevski","year":"2020"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-391"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/icassp40776.2020.9053176"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447929"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"issue":"8","key":"ref39","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref41","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Kong","year":"2020"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref43","first-page":"63483","article-title":"Textually pretrained speech language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Hassid","year":"2023"},{"key":"ref44","article-title":"Spoken question answering and speech continuation using spectrogram-powered LLM","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Nachmani","year":"2024"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11031"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2032"},{"key":"ref47","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"key":"ref48","article-title":"Adversarial reprogramming of neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Elsayed","year":"2019"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i20.30267"},{"key":"ref50","first-page":"11808","article-title":"Voice2series: Reprogramming acoustic models for time series classification","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Yang","year":"2021"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1086"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.colingmain.488"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.759"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.483"},{"key":"ref56","article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","author":"Warden","year":"2018"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1160"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.3390\/app10238643"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.3390\/app11062477"},{"key":"ref60","first-page":"177","article-title":"Database for Arabic speech commands recognition","volume-title":"Proc. CEST","author":"Benamer","year":"2020"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2396"},{"key":"ref62","first-page":"6992","article-title":"A multimodal corpus for emotion recognition in sarcasm","volume-title":"Proc. 13th LREC","author":"Ray","year":"2022"},{"key":"ref63","first-page":"5351","article-title":"AccentDB: A database of non-native English accents to assist neural speech recognition","volume-title":"Proc. 12th Lang. Resour. Eval. Conf.","author":"Ahamad","year":"2020"},{"key":"ref64","article-title":"Voxforge","author":"MacLean","year":"2018"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414470"},{"key":"ref66","first-page":"486","article-title":"Freesound Datasets: A platform for the creation of open audio datasets","volume-title":"Proc. ISMIR","author":"Fonseca","year":"2017"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414922"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-2027"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096680"},{"key":"ref71","article-title":"The LJ speech dataset","author":"Ito","year":"2017"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1951"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10884"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389731"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096988"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448257"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389742"},{"key":"ref80","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tang","year":"2024"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10620644.pdf?arnumber=10620644","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T04:01:25Z","timestamp":1723435285000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10620644\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":80,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3436618","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}