{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:35:16Z","timestamp":1776116116482,"version":"3.50.1"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9054224","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T20:21:13Z","timestamp":1586463673000},"page":"7694-7698","source":"Crossref","is-referenced-by-count":76,"title":["Effectiveness of Self-Supervised Pre-Training for ASR"],"prefix":"10.1109","author":[{"given":"Alexei","family":"Baevski","sequence":"first","affiliation":[]},{"given":"Abdelrahman","family":"Mohamed","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","article-title":"Unsupervised learning of efficient and robust speech representations","author":"kawakami","year":"2019"},{"key":"ref38","article-title":"Self-training for end-to-end speech recognition","volume":"abs 1909 9116","author":"kahn","year":"2019"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6301"},{"key":"ref32","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc of ASRU"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2277"},{"key":"ref36","article-title":"Transformer-based acoustic modeling for hybrid speech recognition","volume":"abs 1910 9799","author":"wang","year":"2019"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683535"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2341"},{"key":"ref11","article-title":"Representation learning with contrastive predictive coding","author":"van den oord","year":"2018"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1473"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682903"},{"key":"ref14","article-title":"wav2vec: Unsupervised pre-training for speech recognition","author":"schneider","year":"2020","journal-title":"Proc of Interspeech"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707741"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953162"},{"key":"ref17","article-title":"Lessons from building acoustic models with a million hours of speech","author":"parthasarathi","year":"2019","journal-title":"Proc of ICASSP"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682172"},{"key":"ref19","article-title":"Representations of language in a model of visually grounded speech signal","author":"chrupa?a","year":"2017","journal-title":"Proc of ACL"},{"key":"ref28","article-title":"Neural discrete representation learning","volume":"abs 1711 937","author":"van den oord","year":"2017"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ISSPA.2012.6310546"},{"key":"ref27","article-title":"Transformers with convolutional context for asr","author":"mohamed","year":"2019"},{"key":"ref3","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2010-483","article-title":"Towards spoken term discovery at scale with zero resources","author":"jansen","year":"2010","journal-title":"Proc of Interspeech"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707765"},{"key":"ref29","article-title":"Learning representations by maximizing mutual information across views","volume":"abs 1906 910","author":"bachman","year":"2019"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639245"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-82"},{"key":"ref7","article-title":"Word embeddings for speech recognition","author":"bengio","year":"2014","journal-title":"Proc of Interspeech"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.909282"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461761"},{"key":"ref1","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","author":"baevski","year":"2020","journal-title":"Proc of ICLR"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-502"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1539"},{"key":"ref21","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc of NAACL"},{"key":"ref24","article-title":"Libri-light: A benchmark for asr with limited or no supervision","author":"kahn","year":"2019"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref26","article-title":"Roberta: A robustly optimized bert pretraining approach","volume":"abs 1907 11692","author":"liu","year":"2019"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1780"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Barcelona, Spain","start":{"date-parts":[[2020,5,4]]},"end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09054224.pdf?arnumber=9054224","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,29]],"date-time":"2023-09-29T19:30:56Z","timestamp":1696015856000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9054224\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9054224","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}