{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T18:33:59Z","timestamp":1770748439959,"version":"3.50.0"},"reference-count":57,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9747674","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T19:50:34Z","timestamp":1651089034000},"page":"7167-7171","source":"Crossref","is-referenced-by-count":34,"title":["ESPnet-SLU: Advancing Spoken Language Understanding Through ESPnet"],"prefix":"10.1109","author":[{"given":"Siddhant","family":"Arora","sequence":"first","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Siddharth","family":"Dalmia","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Pavel","family":"Denisov","sequence":"additional","affiliation":[{"name":"University of Stuttgart"}]},{"given":"Xuankai","family":"Chang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Yushi","family":"Ueda","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Yifan","family":"Peng","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Yuekai","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zoom Video Communications"}]},{"given":"Sujay","family":"Kumar","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Karthik","family":"Ganesan","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Brian","family":"Yan","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Ngoc","family":"Thang Vu","sequence":"additional","affiliation":[{"name":"University of Stuttgart"}]},{"given":"Alan W","family":"Black","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3173"},{"key":"ref38","article-title":"Gated embeddings in e2e speech recognition for conversational-context fusion","author":"kim","year":"2019","journal-title":"Proc ACL"},{"key":"ref33","year":"2022","journal-title":"Rasa"},{"key":"ref32","year":"2022","journal-title":"Plato Research Dialogue System"},{"key":"ref31","article-title":"PyDial: A Multidomain Statistical Dialogue System Toolkit","author":"ultes","year":"2017","journal-title":"ACL"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383615"},{"key":"ref37","author":"wang","year":"2021","journal-title":"Pre-training for low resource speech-to-intent applications"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414922"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-industry.11"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/VCC_BC.2020-24"},{"key":"ref28","article-title":"Japanese dialogue corpus of information navigation and attentive listening annotated with extended ISO-24617 - 2 dialogue act tags","author":"yoshino","year":"2018","journal-title":"Proc LREC 2018"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-818"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-2040"},{"key":"ref2","article-title":"Snips voice platform: An embedded spoken language understanding system for private-by-design voice interfaces","author":"coucke","year":"2018","journal-title":"CoRR"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-3014"},{"key":"ref20","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"2020","journal-title":"Proc NeurIPS"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1228"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref24","article-title":"Mpnet: Masked and permuted pretraining for language understanding","author":"song","year":"2020","journal-title":"Proc NeurIPS"},{"key":"ref23","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc NAACL"},{"key":"ref26","article-title":"Speechbrain: A general-purpose speech toolkit","author":"ravanelli","year":"2021","journal-title":"CoRR"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-788"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.005"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2821"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-5530"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1062"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682881"},{"key":"ref52","article-title":"Unsupervised cross-lingual representation learning for speech recognition","author":"conneau","year":"2020","journal-title":"CoRR"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1826"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1460"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1537"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078545"},{"key":"ref13","article-title":"Spoken language understanding on the edge","author":"saade","year":"2018"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.34"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.588"},{"key":"ref3","article-title":"Speech model pretraining for end-to-end slu","author":"lugosch","year":"2019","journal-title":"Proc INTERSPEECH"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1915"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1162\/089120100561737"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461371"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref9","article-title":"Tie your embeddings down: Cross-modal latent spaces for end-to-end spoken language understanding","author":"agrawal","year":"2020"},{"key":"ref46","article-title":"Switchboard swbd-damsl shallow-discourse-function annotation coders manual","author":"jurafsky","year":"1997"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225858"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"ref47","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2021-1775","article-title":"SUPERB Speech Processing Universal PERformance Bench","author":"yang","year":"2021","journal-title":"Proc INTERSPEECH"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1160"},{"key":"ref41","article-title":"Harpervalleybank: A domainspecific spoken dialog corpus","author":"wu","year":"2020","journal-title":"CoRR"},{"key":"ref44","article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","author":"warden","year":"2018"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3340555.3356098"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Singapore, Singapore","start":{"date-parts":[[2022,5,23]]},"end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09747674.pdf?arnumber=9747674","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,23]],"date-time":"2024-09-23T03:20:53Z","timestamp":1727061653000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9747674\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":57,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9747674","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}