{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T11:01:45Z","timestamp":1758279705751,"version":"3.28.0"},"reference-count":46,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389660","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-8","source":"Crossref","is-referenced-by-count":3,"title":["Few-Shot Spoken Language Understanding Via Joint Speech-Text Models"],"prefix":"10.1109","author":[{"given":"Chung-Ming","family":"Chien","sequence":"first","affiliation":[{"name":"Toyota Technological Institute at Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingjiamei","family":"Zhang","sequence":"additional","affiliation":[{"name":"The University of Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ju-Chieh","family":"Chou","sequence":"additional","affiliation":[{"name":"Toyota Technological Institute at Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karen","family":"Livescu","sequence":"additional","affiliation":[{"name":"Toyota Technological Institute at Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref3","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-475"},{"key":"ref6","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.63"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-679"},{"key":"ref10","article-title":"What do self-supervised speech models know about words?","author":"Pasad","year":"2023","journal-title":"preprint arXiv:2307.00162"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.393"},{"key":"ref12","article-title":"SLAM: A unified encoder for speech and language modeling via speech-text joint pre-training","author":"Bapna","year":"2021","journal-title":"preprint arXiv:2110.10329"},{"key":"ref13","article-title":"SpeechLM: Enhanced speech pre-training with unpaired textual data","author":"Zhang","year":"2023","journal-title":"preprint arXiv:2209.15329"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.108"},{"key":"ref15","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"preprint arXiv:2301.02111"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.189"},{"key":"ref17","article-title":"Cross-lingual language model pretraining","author":"Conneau","year":"2019","journal-title":"NeurIPS"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746137"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.53"},{"key":"ref20","article-title":"Zero-shot end-to-end spoken language understanding via cross-modal selective self-training","author":"He","year":"2023","journal-title":"preprint arXiv:2305.12793"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10231"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11401"},{"key":"ref23","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019","journal-title":"NAACL"},{"key":"ref24","article-title":"mSLAM: Massively multilingual joint pre-training for speech and text","author":"Bapna","year":"2022","journal-title":"preprint arXiv:2202.01374"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10937"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096923"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.105"},{"key":"ref28","article-title":"Mu2SLAM: Multitask, multilingual speech and language models","author":"Cheng","year":"2023","journal-title":"ICML"},{"key":"ref29","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023","journal-title":"preprint arXiv:2303.03926"},{"key":"ref30","article-title":"VioLA: Unified codec language models for speech recognition, synthesis, and translation","author":"Wang","year":"2023","journal-title":"preprint arXiv:2305.16107"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095702"},{"key":"ref32","article-title":"Maestro-U: Leveraging joint speech-text representation learning for zero supervised speech ASR","author":"Chen","year":"2022","journal-title":"SLT"},{"key":"ref33","article-title":"Google USM: Scaling automatic speech recognition beyond 100 languages","author":"Zhang","year":"2023","journal-title":"preprint arXiv:2303.01037"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1493"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1077"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.blackboxnlp-1.4"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref38","article-title":"Cross-lingual similarity of multilingual representations revisited","author":"Del","year":"2022","journal-title":"AACL"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.2307\/2333955"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1448"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref43","article-title":"Massively multilingual neural machine translation in the wild: Findings and challenges","author":"Arivazhagan","year":"2019","journal-title":"preprint arXiv:1907.05019"},{"key":"ref44","article-title":"ESPnetST: All-in-one speech translation toolkit","author":"Inaguma","year":"2020","journal-title":"ACL"},{"key":"ref45","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","author":"Kim","year":"2021","journal-title":"ICML"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2023,12,16]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389660.pdf?arnumber=10389660","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:44:13Z","timestamp":1706028253000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389660\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":46,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389660","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}