{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,21]],"date-time":"2026-07-21T10:34:08Z","timestamp":1784630048964,"version":"3.55.0"},"reference-count":355,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Innovation Fund Denmark through Industrial Ph.D. Programme","award":["0153-00167B"],"award-info":[{"award-number":["0153-00167B"]}]},{"name":"Innovation Fund Denmark through Industrial Ph.D. Programme","award":["8053-00184B"],"award-info":[{"award-number":["8053-00184B"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. 
Signal Process."],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1109\/jstsp.2022.3207050","type":"journal-article","created":{"date-parts":[[2022,9,15]],"date-time":"2022-09-15T19:37:02Z","timestamp":1663270622000},"page":"1179-1210","source":"Crossref","is-referenced-by-count":325,"title":["Self-Supervised Speech Representation Learning: A Review"],"prefix":"10.1109","volume":"16","author":[{"given":"Abdelrahman","family":"Mohamed","sequence":"first","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9654-5747","authenticated-orcid":false,"given":"Hung-yi","family":"Lee","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and Department of Computer Science Information Engineering, National Taiwan University, Taipei, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lasse","family":"Borgholt","sequence":"additional","affiliation":[{"name":"Corti AI and Department of Computer Science, University of Copenhagen, K\u00f8benhavn, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jakob D.","family":"Havtorn","sequence":"additional","affiliation":[{"name":"Corti AI and Department of Applied Mathematics and Computer Science, Technical University of Denmark, Kgs. Lyngby, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joakim","family":"Edin","sequence":"additional","affiliation":[{"name":"Corti AI, Technical University of Denmark, Kgs. 
Lyngby, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Christian","family":"Igel","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Copenhagen, K\u00f8benhavn, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Katrin","family":"Kirchhoff","sequence":"additional","affiliation":[{"name":"AWS AI Labs, Amazon, Seattle, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shang-Wen","family":"Li","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Karen","family":"Livescu","sequence":"additional","affiliation":[{"name":"Toyota Technological Institute at Chicago, Chicago, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lars","family":"Maal\u00f8e","sequence":"additional","affiliation":[{"name":"Corti AI and Department of Applied Mathematics and Computer Science, Technical University of Denmark, Kgs. 
Lyngby, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4126-6556","authenticated-orcid":false,"given":"Tara N.","family":"Sainath","sequence":"additional","affiliation":[{"name":"Google, Inc., New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5970-8631","authenticated-orcid":false,"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-3210-1"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1999-599"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1006\/csla.2001.0186"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1660839"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2007.09.004"},{"key":"ref8","article-title":"A tutorial on energy-based\n                        learning","volume-title":"Predicting Structured\n                        Data","author":"LeCun","year":"2006"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.50"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1126\/science.aaa8415"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/MASSP.1984.1162229"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.1993.716791"},{"key":"ref13","first-page":"3","article-title":"Autoencoders, minimum description length and Helmholtz\n                        free energy","volume":"6","author":"Hinton","year":"1994","journal-title":"Proc. Adv. Neural Inf. 
Process.\n                        Syst."},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1038\/44565"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1126\/science.1127647"},{"key":"ref16","article-title":"On the opportunities and risks of foundation\n                        models","author":"Bommasani","year":"2021"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3134634"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/tkde.2021.3090866"},{"key":"ref19","first-page":"842","article-title":"A primer in BERTology: What we know about how BERT\n                        works","volume-title":"Trans. Assoc. Comput.\n                        Linguistics","volume":"8","author":"Rogers","year":"2020"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref21","first-page":"7516","article-title":"Which *BERT? A survey organizing contextualized\n                        encoders","volume-title":"Proc. Conf. Empirical Methods\n                        Natural Lang. Process.","author":"Xia","year":"2020"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-020-1647-3"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"ref24","article-title":"A brief overview of unsupervised neural speech\n                        representation learning","volume-title":"Proc. 2nd\n                        Workshop Self-supervised Learn. 
Audio Speech Process.","author":"Borgholt","year":"2022"},{"key":"ref25","article-title":"Deep representation learning in speech processing:\n                        Challenges, recent advances, and future trends","author":"Latif","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1979.1170822"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1985.1164581"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/89.279278"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1994.1019"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1986.1169179"},{"key":"ref31","first-page":"1197","article-title":"Speech recognition using SVMs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Smith","year":"2001"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2003.1318396"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2003.1202334"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-328"},{"key":"ref36","first-page":"3371","article-title":"Stacked denoising autoencoders: Learning useful\n                        representations in a deep network with a local denoising\n                        criterion","volume":"11","author":"Vincent","year":"2010","journal-title":"J. Mach. Learn. Res."},{"issue":"2","key":"ref37","first-page":"307","article-title":"Noise-contrastive estimation of unnormalized\n                        statistical models, with applications to natural image\n                        statistics","volume":"13","author":"Gutmann","year":"2012","journal-title":"J. Mach. Learn. 
Res."},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1038\/381607a0"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7503.003.0105"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495649"},{"key":"ref41","first-page":"371","article-title":"A unified energy-based framework for unsupervised\n                        learning","volume-title":"Proc. 11th Int. Conf. Artif.\n                        Intell. Statist.","author":"Ranzato","year":"2007"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1162\/089976602760128018"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/w18-5446"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref48","article-title":"Auto-encoding variational\n                    Bayes","volume-title":"Proc. 2nd Int. Conf. Learn.\n                        Representations","author":"Kingma","year":"2014"},{"key":"ref49","first-page":"1278","article-title":"Stochastic backpropagation and approximate inference\n                        in deep generative models","volume-title":"Proc. Int.\n                        Conf. Mach. 
Learn.","author":"Rezende","year":"2014"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1561\/2200000089"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"issue":"8","key":"ref52","first-page":"9","article-title":"Language models are unsupervised multitask\n                        learners","volume-title":"OpenAI Blog","volume":"1","author":"Radford","year":"2019"},{"key":"ref53","article-title":"Megatron-LM: Training multi-billion parameter language\n                        models using model parallelism","author":"Shoeybi","year":"2020"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref55","article-title":"RoBERTa: A robustly optimized BERT pretraining\n                        approach","author":"Liu","year":"2019"},{"key":"ref56","article-title":"Representation learning with contrastive predictive\n                        coding","author":"Oord","year":"2018"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref59","article-title":"Improved baselines with momentum contrastive\n                        learning","author":"Chen","year":"2020"},{"key":"ref60","first-page":"9912","article-title":"Unsupervised learning of visual features by\n                        contrasting cluster assignments","volume-title":"Proc.\n                        Adv. Neural Inf. Process. Syst.","author":"Caron","year":"2020"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683690"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1470"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1800"},{"key":"ref64","first-page":"5135","article-title":"Scaling ASR improves zero and few shot\n                        learning","volume-title":"Proc. 
Interspeech","author":"Xiao","year":"2022"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007379606734"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404803"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/OJSP.2020.3045349"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2605"},{"key":"ref70","first-page":"2980","article-title":"A recurrent latent variable model for sequential\n                        data","volume-title":"Proc. 29th Conf. Neural Inf.\n                        Process. Syst.","author":"Chung","year":"2015"},{"key":"ref71","first-page":"2207","article-title":"Sequential neural models with stochastic\n                        layers","volume-title":"Proc. 30th Conf. Neural Inf.\n                        Process. Syst.","author":"Fraccaro","year":"2016"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-349"},{"key":"ref73","first-page":"1876","article-title":"Unsupervised learning of disentangled and\n                        interpretable representations from sequential data","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hsu","year":"2017"},{"key":"ref74","article-title":"STCN: Stochastic temporal convolutional\n                        networks","volume-title":"Proc. 7th Int. Conf. Learn.\n                        Representations","author":"Aksan","year":"2019"},{"key":"ref75","first-page":"6309","article-title":"Neural discrete representation\n                        learning","volume-title":"Proc. Adv. Neural Inf. 
Process.\n                        Syst.","author":"Oord","year":"2017"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.593"},{"key":"ref78","first-page":"1336","article-title":"On generative spoken language modeling from raw\n                        audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Trans. Assoc. Comput.\n                        Linguistics"},{"key":"ref79","article-title":"Estimating or propagating gradients through stochastic\n                        neurons for conditional computation","author":"Bengio","year":"2013"},{"key":"ref80","first-page":"125","article-title":"WaveNet: A generative model for raw\n                        audio","volume-title":"Proc. 9th ISCA Speech Synth.\n                        Workshop","author":"Oord","year":"2016"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1983.1171915"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1985.1168412"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref84","article-title":"Categorical reparameterization with\n                        Gumbel-softmax","volume-title":"Proc. Int. Conf. 
Learn.\n                        Representations","author":"Jang","year":"2017"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1518"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638312"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855085"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-639"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7179087"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-644"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682903"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1473"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/45.1890"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1162\/089976602317318938"},{"key":"ref96","article-title":"TIMIT acoustic-phonetic continuous speech corpus\n                        LDC93S1","volume-title":"Linguistic Data Consortium","author":"Garofolo","year":"1993"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.213"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1228"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053176"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054458"},{"key":"ref101","article-title":"Improving transformer-based speech recognition\n                        using unsupervised pre-training","author":"Jiang","year":"2019"},{"key":"ref102","article-title":"Masked pre-trained encoder base on joint\n                        
CTC-Transformer","author":"Liu","year":"2020"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053541"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414539"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-905"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-349"},{"key":"ref108","article-title":"Generalized Autoregressive Pretraining for\n                        Language Understanding","volume-title":"Proc. Adv. Neural\n                        Inf. Process. Syst.","volume":"32","author":"Yang","year":"2019"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1511"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/icassp40776.2020.9053176"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1066"},{"key":"ref112","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from\n                        overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. 
Res."},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2341"},{"key":"ref115","article-title":"Self-supervised audio representation learning for\n                        mobile devices","author":"Tagliasacchi","year":"2019"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.2985586"},{"key":"ref117","article-title":"Learning audio representations via phase\n                        prediction","author":"Quitry","year":"2019"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-82"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383575"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2194"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.106"},{"key":"ref124","article-title":"vq-wav2vec: Self-supervised learning of discrete\n                        speech representations","volume-title":"Proc. Int. Conf.\n                        Learn. Representations","author":"Baevski","year":"2020"},{"key":"ref125","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning\n                        of speech representations","volume-title":"Proc. Adv.\n                        Neural Inf. Process. 
Syst.","volume":"33","author":"Baevski","year":"2020"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-717"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-391"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054224"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/jstsp.2022.3188113"},{"key":"ref132","article-title":"data2vec: A general framework for self-supervised\n                        learning in speech, vision and language","author":"Baevski","year":"2022"},{"key":"ref133","first-page":"3915","article-title":"Self-supervised learning with random-projection\n                        quantizer for speech recognition","volume-title":"Proc.\n                        Int. Conf. Mach. Learn.","volume":"162","author":"Chiu","year":"2022"},{"key":"ref134","first-page":"41","article-title":"Learning a distance metric from relative\n                        comparisons","volume-title":"Proc. Neural Inf. Process.\n                        Syst.","author":"Schultz","year":"2003"},{"key":"ref135","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation\n                        principle for unnormalized statistical models","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Gutmann","year":"2010"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2010.57"},{"key":"ref137","article-title":"Generating diverse high-fidelity images with\n                        VQ-VAE-2","volume-title":"Proc. Adv. Neural Inf. 
Process.\n                        Syst.","volume":"32","author":"Razavi van den Oord","year":"2019"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref142","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance\n                        deep learning library","volume-title":"Proc. Adv. Neural\n                        Inf. Process. Syst.","author":"Paszke","year":"2019"},{"key":"ref143","article-title":"Automatic lipreading to enhance speech recognition\n                        (speech reading)","author":"Petajan","year":"1984"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2003.817150"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2006.886017"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1016\/0163-6383(90)90039-B"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1207\/s15516709cog2601_4"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2004-424"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54184-6_6"},{"key":"ref150","article-title":"Learning audio-visual speech representation by masked\n                        multimodal cluster prediction","volume-title":"Proc. Int.\n                        Conf. Learn. Representations","author":"Shi","year":"2022"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1121\/1.2029064"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.25144\/18431"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-308"},{"key":"ref154","first-page":"689","article-title":"Multimodal deep learning","volume-title":"Proc. 28th Int. Conf. Mach. 
Learn.","author":"Ngiam","year":"2011"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2015.05.005"},{"key":"ref156","first-page":"2222","article-title":"Multimodal learning with deep boltzmann\n                        machines","volume-title":"Proc. Adv. Neural Inf. Process.\n                        Syst.","author":"Srivastava","year":"2012"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.2307\/2333955"},{"key":"ref158","first-page":"1247","article-title":"Deep canonical correlation\n                    analysis","volume-title":"Proc. 30th Int. Conf. Mach.\n                        Learn.","author":"Andrew","year":"2013"},{"key":"ref159","first-page":"1083","article-title":"On deep multi-view representation\n                        learning","volume-title":"Proc. Int. Conf. Mach.\n                        Learn.","author":"Wang","year":"2015"},{"key":"ref160","article-title":"Deep variational canonical correlation\n                        analysis","author":"Wang","year":"2016"},{"key":"ref161","first-page":"1967","article-title":"Nonparametric canonical correlation\n                        analysis","volume-title":"Proc. Int. Conf. Mach.\n                        Learn.","author":"Michaeli","year":"2016"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44668-0_50"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1142\/S012906570000034X"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1016\/S0893-6080(99)00075-1"},{"key":"ref165","article-title":"A probabilistic interpretation of canonical\n                        correlation analysis","volume":"688","author":"Bach","year":"2005"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178840"},{"key":"ref167","article-title":"Multilingual distributed representations without word\n                        alignment","volume-title":"Proc. Int. Conf. 
Learn.\n                        Representations","author":"Hermann","year":"2013"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2505665"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-99"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.12967"},{"key":"ref171","article-title":"Learning words from images and\n                    speech","volume-title":"Proc. Adv. Neural Inf. Process.\n                        Syst. Workshop Learn. Semantics","author":"Synnaeve","year":"2014"},{"key":"ref172","first-page":"1866","article-title":"Unsupervised learning of spoken language with visual\n                        context","volume-title":"Proc. Adv. Neural Inf. Process.\n                        Syst.","author":"Harwath","year":"2016"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3067"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1312"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747103"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K19-1006"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-96"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746449"},{"key":"ref180","article-title":"Self-supervised representation learning for speech\n                        using visual grounding and masked language modeling","volume-title":"Proc. AAAI Conf. Artif. Intell. SAS Workshop","author":"Peng","year":"2022"},{"key":"ref181","article-title":"Learning hierarchical discrete linguistic units from\n                        visually-grounded speech","volume-title":"Proc. Int. Conf.\n                        Learn. 
Representations","author":"Harwath","year":"2019"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p17-1057"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461761"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10652"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1148"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462396"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683069"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.21437\/SLTU.2018-52"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3051"},{"key":"ref191","article-title":"SLAM: A unified encoder for speech and language\n                        modeling via speech-text joint pre-training","author":"Bapna","year":"2021"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7179089"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178970"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1592"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269008"},{"key":"ref196","first-page":"154","article-title":"A segmental framework for fully-unsupervised\n                        large-vocabulary speech recognition","volume-title":"Comput.\n                        Speech Lang.","volume":"46","author":"Kamper","year":"2017"},{"key":"ref197","article-title":"Word-level acoustic modeling with convolutional vector\n                        regression","volume-title":"Proc. Int. Conf. Mach. 
Learn.\n                        Workshop Representation Learn","author":"Maas","year":"2012"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-273"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707765"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2364"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683639"},{"key":"ref202","article-title":"A correspondence variational autoencoder for\n                        unsupervised acoustic word embeddings","volume-title":"Proc. NeurIPS Workshop Self-Supervised Learn. Speech Audio\n                        Process.","author":"Peng","year":"2020"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-304"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472619"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.repl4nlp-1.20"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.846"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383625"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref211","first-page":"69","article-title":"The fisher corpus: A resource for the next generations\n                        of speech-to-text","volume-title":"Proc. Int. Conf. Lang.\n                        Resour. Eval.","volume":"4","author":"Cieri","year":"2004"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref214","first-page":"4218","article-title":"Common voice: A massively-multilingual speech\n                        corpus","volume-title":"Proc. Int. 
Conf. Lang. Resour.\n                        Eval.","author":"Ardila","year":"2020"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref217","first-page":"16","article-title":"Speech recognition and keyword spotting for\n                        low-resource languages: BABEL project research at\n                    CUED","volume-title":"Proc. 4th Int. Workshop Spoken Lang.\n                        Technol. Under-Resourced Lang.","author":"Gales","year":"2014"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-143"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747077"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-99579-3_21"},{"key":"ref222","first-page":"125","article-title":"TED-LIUM: An automatic speech recognition dedicated\n                        corpus","volume-title":"Proc. Int. Conf. Lang. 
Resour.\n                        Eval.","author":"Rousseau","year":"2012"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225858"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383459"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1007\/11939993_73"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref227","article-title":"Aidatatang_200zh, a free Chinese\n                        mandarin speech corpus"},{"key":"ref228","article-title":"MAGICDATA mandarin chinese read speech\n                        corpus","year":"2019"},{"key":"ref229","article-title":"ST-CMDS-20170001_1 Free ST Chinese mandarin\n                        corpus"},{"key":"ref230","article-title":"Primewords Chinese\n                        corpus set 1","year":"2018"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404805"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1768"},{"key":"ref233","first-page":"2024","article-title":"Interface databases: Design and collection of a\n                        multilingual emotional speech database","volume-title":"Proc. Int. Conf. Lang. Resour. 
Eval.","author":"Hozjan","year":"2002"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p18-1208"},{"key":"ref235","article-title":"CSTR VCTK corpus: English multi-speaker corpus for\n                        CSTR voice cloning toolkit","author":"Veaux","year":"2017"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2396"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7179090"},{"key":"ref239","first-page":"1","article-title":"Augmenting LibriSpeech with french translations: A\n                        multimodal corpus for direct speech translation\n                    evaluation","volume-title":"Proc. Int. Conf. Lang. Resour.\n                        Eval.","author":"Kocabiyikoglu","year":"2018"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-2027"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2013.01.008"},{"key":"ref242","first-page":"477","article-title":"First automatic fongbe continuous speech recognition\n                        system: Development of acoustic models and language\n                    models","volume-title":"Proc. Federated Conf. Comput. Sci.\n                        Inf. Syst.","author":"Laleye","year":"2016"},{"key":"ref243","first-page":"94","article-title":"Developments of swahili resources for an automatic\n                        speech recognition system","volume-title":"Proc. Spoken\n                        Lang. Technol. Under-Resourced Lang.","author":"Gelas","year":"2012"},{"key":"ref244","first-page":"3863","article-title":"Collecting resources in sub-Saharan African languages\n                        for automatic speech recognition: A case study of\n                    Wolof","volume-title":"Proc. 10th Lang. Resour. 
Eval.\n                        Conf.","author":"Gauthier","year":"2016"},{"key":"ref245","article-title":"MUSAN: A music, speech, and noise\n                        corpus","author":"Snyder","year":"2015"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1111\/2041-210X.13103"},{"key":"ref247","article-title":"Speech commands: A dataset for limited-vocabulary\n                        speech recognition","author":"Warden","year":"2018"},{"key":"ref248","article-title":"Spoken language identification","author":"Oponowicz","year":"2018"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2017.2778423"},{"key":"ref250","first-page":"1068","article-title":"Neural audio synthesis of musical notes with\n                        WaveNet autoencoders","volume-title":"Proc. Int. Conf.\n                        Mach. Learn.","author":"Engel","year":"2017"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2380"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3084"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1163\/9789401206884_014"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-740"},{"key":"ref255","article-title":"Pushing the limits of semi-supervised learning\n                        for automatic speech recognition","volume-title":"Proc.\n                        Workshop Self-Supervised Learn. 
Speech Audio Process.","author":"Zhang","year":"2020"},{"key":"ref256","article-title":"Unified hypersphere embedding for speaker\n                        recognition","author":"Hajibabaei","year":"2018"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414722"},{"key":"ref258","article-title":"A fine-tuned wav2vec 2.0\/HuBERT benchmark for speech\n                        emotion recognition, speaker verification and spoken language\n                        understanding","author":"Wang","year":"2021"},{"key":"ref259","article-title":"GTTS-EHU systems for QUESST at MediaEval\n                        2014","volume-title":"Proc. MediaEval","author":"Rodrguez-Fuentes","year":"2014"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-556"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2743"},{"key":"ref262","first-page":"125","article-title":"Hear: Holistic evaluation of audio\n                        representations","volume-title":"Proc. NeurIPS\n                        Competitions Demonstrations Track","volume":"176","author":"Turian","year":"2022"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1242"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746790"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3033"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1693"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-5004"},{"key":"ref268","first-page":"27826","article-title":"Unsupervised speech\n                    recognition","volume-title":"Proc. Neural Inf. 
Process.\n                        Syst.","volume":"34","author":"Baevski","year":"2021"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1448"},{"key":"ref270","article-title":"Do self-supervised and supervised methods learn\n                        similar visual representations","author":"Grigg","year":"2021"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1182"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-2"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747022"},{"key":"ref274","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-2231"},{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414321"},{"key":"ref276","article-title":"A comparison of discrete latent variable models for\n                        speech representation learning","author":"Zhou","year":"2020"},{"key":"ref277","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref278","first-page":"1084","article-title":"Scaling effect of self-supervised\n                        models","volume-title":"Proc. Annu. Conf. Int. Speech\n                        Commun. Assoc.","author":"Pu","year":"2021"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-638"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688137"},{"key":"ref281","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-236"},{"key":"ref282","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-329"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1800"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462002"},{"key":"ref285","first-page":"5769","article-title":"Improved training of wasserstein\n                    GANs","volume-title":"Proc. 31st Int. Conf. Neural Inf.\n                        Process. 
Syst.","author":"Gulrajani","year":"2017"},{"key":"ref286","first-page":"7354","article-title":"Unsupervised cross-modal alignment of speech and text\n                        embedding spaces","volume-title":"Proc. Adv. Neural Inf.\n                        Process. Syst.","volume":"31","author":"Chung","year":"2018"},{"key":"ref287","article-title":"Word translation without parallel\n                    data","volume-title":"Proc. Int. Conf. Learn.\n                        Representations","author":"Conneau","year":"2018"},{"key":"ref288","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683550"},{"key":"ref289","first-page":"789","article-title":"A robust self-learning method for fully unsupervised\n                        cross-lingual mappings of word embeddings","volume-title":"Proc. 56th Annu. Meeting Assoc. Comput. Linguistics","author":"Artetxe","year":"2018"},{"key":"ref290","article-title":"Unsupervised speech recognition via segmental\n                        empirical output distribution matching","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yeh","year":"2018"},{"key":"ref291","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-877"},{"key":"ref292","first-page":"3553","article-title":"Unsupervised sequence classification using sequential\n                        output statistics","volume-title":"Proc. Adv. Neural Inf.\n                        Process. Syst.","author":"Liu","year":"2017"},{"key":"ref293","first-page":"1856","article-title":"Completely unsupervised speech recognition by a\n                        generative adversarial network harmonized with iteratively refined hidden\n                        Markov models","volume-title":"Proc. Annu. Conf. Int.\n                        Speech Commun. 
Assoc.","author":"Chen","year":"2019"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10170"},{"key":"ref295","first-page":"12","article-title":"Deciphering foreign language","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics: Hum. Lang.\n                        Technol.","author":"Ravi","year":"2011"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.1109\/slt54892.2023.10023187"},{"key":"ref297","first-page":"2672","article-title":"Generative adversarial\n                    nets","volume-title":"Proc. Adv. Neural Inf. Process.\n                        Syst.","author":"Goodfellow","year":"2014"},{"key":"ref298","first-page":"214","article-title":"GAN wasserstein","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Arjovsky","year":"2017"},{"key":"ref299","article-title":"Unsupervised neural machine\n                        translation","volume-title":"Proc. Int. Conf. Learn.\n                        Representations","author":"Artetxe","year":"2018"},{"key":"ref300","article-title":"Unsupervised machine translation using monolingual\n                        corpora only","volume-title":"Proc. Int. Conf. 
Learn.\n                        Representations","author":"Lample","year":"2018"},{"key":"ref301","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747357"},{"key":"ref302","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"ref303","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683307"},{"key":"ref304","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053831"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1558"},{"key":"ref306","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683480"},{"key":"ref307","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3167"},{"key":"ref308","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref309","volume-title":"Statistical Methods for Speech Recognition","author":"Jelinek","year":"1997"},{"key":"ref310","doi-asserted-by":"publisher","DOI":"10.1006\/csla.2001.0184"},{"key":"ref311","article-title":"On using monolingual corpora in neural machine\n                        translation","author":"Gulcehre","year":"2015"},{"key":"ref312","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-343"},{"key":"ref313","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref314","article-title":"Training neural speech recognition systems with\n                        synthetic speech 
augmentation","author":"Li","year":"2018"},{"key":"ref315","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682816"},{"key":"ref316","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003990"},{"key":"ref317","doi-asserted-by":"publisher","DOI":"10.1109\/CISP-BMEI51763.2020.9263564"},{"key":"ref318","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053104"},{"key":"ref319","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639619"},{"key":"ref320","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2456"},{"key":"ref321","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1930"},{"key":"ref322","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1009"},{"key":"ref323","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639245"},{"key":"ref324","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268953"},{"key":"ref325","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2904"},{"key":"ref326","article-title":"The zero resource speech benchmark 2021: Metrics\n                        and baselines for unsupervised spoken language\n                    modeling","author":"Nguyen","year":"2020"},{"key":"ref327","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.033"},{"key":"ref328","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269011"},{"key":"ref329","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3232"},{"key":"ref330","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1503"},{"key":"ref331","article-title":"Textless speech emotion conversion using\n                        decomposed and discrete representations","author":"Kreuk","year":"2021"},{"key":"ref332","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00545"},{"key":"ref333","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-612"},{"key":"ref334","article-title":"Discretization and re-synthesis: An alternative method\n                        to solve the 
cocktail party problem","author":"Shi","year":"2021"},{"key":"ref335","article-title":"Discretalk: Text-to-speech as a machine translation\n                        problem","author":"Hayashi","year":"2020"},{"key":"ref336","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.63"},{"key":"ref337","first-page":"2790","article-title":"Parameter-efficient transfer learning for\n                        NLP","volume":"97","author":"Houlsby","year":"2019","journal-title":"Proc. Int. Conf. Mach. Learn.\n                        Res."},{"key":"ref338","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.1"},{"key":"ref339","first-page":"4884","article-title":"Parameter-efficient transfer learning with diff\n                        pruning","volume-title":"Proc. 59th Annu. Meeting Assoc.\n                        Comput. Linguistics, 11th Int. Joint Conf. Natural Lang. Process.","author":"Guo","year":"2021"},{"key":"ref340","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746223"},{"key":"ref341","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-10610"},{"key":"ref342","first-page":"21256","article-title":"PARP: Prune, adjust and re-prune for\n                        self-supervised speech recognition","volume-title":"Proc.\n                        Adv. Neural Inf. Process. 
Syst.","author":"Lai","year":"2021"},{"key":"ref343","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"ref344","article-title":"Conditional computation in neural networks for faster\n                        models","author":"Bengio","year":"2016"},{"key":"ref345","doi-asserted-by":"publisher","DOI":"10.1201\/9781003162810-13"},{"key":"ref346","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref347","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1454"},{"key":"ref348","first-page":"18003","article-title":"ContentVec: An improved self-supervised speech\n                        representation by disentangling speakers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qian","year":"2022"},{"key":"ref349","first-page":"16251","article-title":"Neural analysis and synthesis: Reconstructing speech\n                        from self-supervised representations","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Choi","year":"2021"},{"key":"ref350","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-390"},{"key":"ref351","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747242"},{"key":"ref352","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-519"},{"key":"ref353","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746220"},{"key":"ref354","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747379"},{"key":"ref355","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3200909"}],"container-title":["IEEE Journal of Selected Topics in Signal 
Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9923627\/09893562.pdf?arnumber=9893562","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,4]],"date-time":"2024-07-04T04:01:58Z","timestamp":1720065718000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9893562\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10]]},"references-count":355,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2022.3207050","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10]]}}}