{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T22:05:15Z","timestamp":1758405915927,"version":"3.28.0"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389735","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-8","source":"Crossref","is-referenced-by-count":4,"title":["Joint Prediction and Denoising for Large-Scale Multilingual Self-Supervised Learning"],"prefix":"10.1109","author":[{"given":"William","family":"Chen","sequence":"first","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiatong","family":"Shi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Brian","family":"Yan","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dan","family":"Berrebbi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wangyou","family":"Zhang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifan","family":"Peng","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuankai","family":"Chang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Soumi","family":"Maiti","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2022","journal-title":"arXiv preprint"},{"key":"ref2","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"in Proc. NeurIPS","volume":"33","author":"Brown"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688137"},{"key":"ref6","article-title":"PaLM: Scaling language modeling with pathways","author":"Chowdhery","year":"2022","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"BLOOM: A 176b-parameter open-access multilingual language model","author":"Scao","year":"2022","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"ref9","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","author":"Srivastava","year":"2022","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1873"},{"key":"ref11","first-page":"12 449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"in Proc. NeurIPS","volume":"33","author":"Baevski"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747022"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref15","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"in Proc. ICML","volume":"162","author":"Baevski"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095326"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1061"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3192714"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-327"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1823"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1979"},{"key":"ref23","article-title":"Unsupervised cross-lingual representation learning for speech recognition","author":"Conneau","year":"2020","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-143"},{"key":"ref25","article-title":"Scaling speech technology to 1,000+ languages","author":"Pratap","year":"2023","journal-title":"arXiv preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.63"},{"key":"ref27","first-page":"3915","article-title":"Self-supervised learning with random-projection quantizer for speech recognition","volume-title":"in Proc. ICML","volume":"162","author":"Chiu"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1087"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1316"},{"key":"ref30","article-title":"Google USM: Scaling automatic speech recognition beyond 100 languages","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref31","first-page":"4218","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"in Proceedings of the Twelfth Language Resources and Evaluation Conference","author":"Ardila"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3038"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref36","first-page":"16","article-title":"Speech recognition and keyword spotting for low-resource languages: Babel project research at cued","volume-title":"in Fourth International workshop on spoken language technologies for under-resourced languages (SLTU-2014)","author":"Gales"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10023141"},{"key":"ref38","article-title":"Attention is all you need","volume-title":"Proc. NeurIPS","volume":"30","author":"Vaswani"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1176"},{"article-title":"Adam: A method for stochastic optimization","volume-title":"in Proc. ICLR","author":"Kingma","key":"ref40"},{"key":"ref41","article-title":"Exploration on HuBERT with multiple resolutions","author":"Shi","year":"2023","journal-title":"arXiv preprint"},{"key":"ref42","first-page":"9361","article-title":"Squeezeformer: An efficient transformer for automatic speech recognition","volume-title":"in Proc. NeurIPS","volume":"35","author":"Kim"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9687874"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362064"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2680"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022656"},{"article-title":"FlashAttention: Fast and memory-efficient exact attention with IO-awareness","volume-title":"in Proc. NeurIPS","author":"Dao","key":"ref48"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2023,12,16]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389735.pdf?arnumber=10389735","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:44:13Z","timestamp":1706028253000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389735\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389735","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}