{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T22:11:42Z","timestamp":1773439902905,"version":"3.50.1"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["ACI-1548562,ACI-1445606"],"award-info":[{"award-number":["ACI-1548562,ACI-1445606"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9688137","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"228-235","source":"Crossref","is-referenced-by-count":41,"title":["An Exploration of Self-Supervised Pretrained Representations for End-to-End Speech Recognition"],"prefix":"10.1109","author":[{"given":"Xuankai","family":"Chang","sequence":"first","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Takashi","family":"Maekaku","sequence":"additional","affiliation":[{"name":"Yahoo Japan Corporation"}]},{"given":"Pengcheng","family":"Guo","sequence":"additional","affiliation":[{"name":"Northwestern Poly technical University"}]},{"given":"Jing","family":"Shi","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences"}]},{"given":"Yen-Ju","family":"Lu","sequence":"additional","affiliation":[{"name":"Academia Sinica"}]},{"given":"Aswin Shanmugam","family":"Subramanian","sequence":"additional","affiliation":[{"name":"Johns Hopkins University"}]},{"given":"Tianzi","family":"Wang","sequence":"additional","affiliation":[{"name":"Johns Hopkins University"}]},{"given":"Shu-wen","family":"Yang","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]},{"given":"Yu","family":"Tsao","sequence":"additional","affiliation":[{"name":"Academia Sinica"}]},{"given":"Hung-yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2013.6701894"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383615"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.34"},{"key":"ref37","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized sta-tistical models","author":"gutmann","year":"0","journal-title":"Proc AISTATS"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414858"},{"key":"ref35","first-page":"449","article-title":"A comparative study on Trans-former vs RNN in speech applications","author":"karita","year":"2019","journal-title":"Proc ASRU"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref27","first-page":"478","article-title":"Unsuper-vised deep embedding for clustering analysis","author":"xie","year":"2016","journal-title":"Proc ICML"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288864"},{"key":"ref20","article-title":"Non-autoregressive predictive coding for learning speech rep-resentations from local dependencies","author":"liu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref21","article-title":"Repre-sentation learning with contrastive predictive coding","author":"van den oord","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref24","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref23","article-title":"vq-wav2vec: Self-supervised learning of discrete speech represen-tations","author":"baevski","year":"0","journal-title":"Proc ICLR"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414460"},{"key":"ref50","first-page":"62","volume":"16","author":"john","year":"2014","journal-title":"XSEDE Accelerating Scientific Discovery Computing in Science & Engineering"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/2792745.2792775"},{"key":"ref10","first-page":"896","article-title":"Pseudo-label: The simple and efficient semi-supervised learning method for deep neural networks","author":"lee","year":"0","journal-title":"Proc ICML"},{"key":"ref11","article-title":"End-to-end ASR: From supervised to semi-supervised learning with mod-ern architectures","author":"synnaeve","year":"0","journal-title":"Proc ICML"},{"key":"ref40","first-page":"100","article-title":"The AMI meeting corpus","volume":"88","author":"mccowan","year":"2005","journal-title":"Proceedings of the 5th International Conference on Methods and Techniques in Behavioral Research"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054295"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"ref14","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional trans-formers for language understanding","author":"devlin","year":"2019","journal-title":"Proc NAACL"},{"key":"ref15","author":"radford","year":"2018","journal-title":"Improving language understanding by generative pre-training"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1473"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1228"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054458"},{"key":"ref19","article-title":"Tera: Self-supervised learning of transformer encoder representation for speech","author":"liu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref4","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref3","first-page":"4960","article-title":"Listen, attend and spell","author":"william","year":"2016","journal-title":"Proc ICASSP"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref5","first-page":"5884","article-title":"Speech-Transformer: A no-recurrence sequence-to-sequence model for speech recognition","author":"dong","year":"2018","journal-title":"Proc ICASSP"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6301"},{"key":"ref9","article-title":"SpeechStew: Simply mix all avail-able speech recognition data to train one large neural network","author":"chan","year":"2021","journal-title":"Proc INTERSPEECH"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.902460"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.005"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref42","first-page":"198","article-title":"TED-LIUM 3: Twice as much data and corpus repartition for experiments on speaker adaptation","author":"hernandez","year":"2018","journal-title":"Proc Specom"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/11939993_73"},{"key":"ref44","first-page":"3935","article-title":"En-hancing the TED-LIUM corpus with selected data for language modeling and more TED talks","author":"rousseau","year":"0","journal-title":"Proc LREC"},{"key":"ref43","first-page":"3735","article-title":"Sequence to multi-sequence learning via conditional chain mapping for mixture signals","author":"shi","year":"0","journal-title":"Proc NeurIPS"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Cartagena, Colombia","start":{"date-parts":[[2021,12,13]]},"end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09688137.pdf?arnumber=9688137","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:42:04Z","timestamp":1652733724000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9688137\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9688137","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}