{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T23:16:05Z","timestamp":1776467765108,"version":"3.51.2"},"reference-count":134,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100007224","name":"Connaught Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007224","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Arts and Science Bridging Fund"},{"DOI":"10.13039\/501100003579","name":"University of Toronto","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003579","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","award":["RGPIN-2022-04431"],"award-info":[{"award-number":["RGPIN-2022-04431"]}],"id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ANR","award":["ANR-17-CE28-0009"],"award-info":[{"award-number":["ANR-17-CE28-0009"]}]},{"name":"ANR","award":["ANR-17-EURE-0017"],"award-info":[{"award-number":["ANR-17-EURE-0017"]}]},{"name":"ANR","award":["ANR-10-IDEX-0001-02"],"award-info":[{"award-number":["ANR-10-IDEX-0001-02"]}]},{"name":"ANR","award":["ANR-19-P3IA-0001"],"award-info":[{"award-number":["ANR-19-P3IA-0001"]}]},{"name":"PRAIRIE 3IA Institute"},{"name":"Meta AI Research Gift"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1109\/jstsp.2022.3206084","type":"journal-article","created":{"date-parts":[[2022,9,12]],"date-time":"2022-09-12T19:54:59Z","timestamp":1663012499000},"page":"1211-1226","source":"Crossref","is-referenced-by-count":21,"title":["Self-Supervised Language Learning From Raw Audio: Lessons From the Zero Resource Speech Challenge"],"prefix":"10.1109","volume":"16","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9603-953X","authenticated-orcid":false,"given":"Ewan","family":"Dunbar","sequence":"first","affiliation":[{"name":"Department of French, University of Toronto, Toronto, ON, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nicolas","family":"Hamilakis","sequence":"additional","affiliation":[{"name":"&#x00C9;cole Normale Sup&#x00E9;rieure, Paris, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Emmanuel","family":"Dupoux","sequence":"additional","affiliation":[{"name":"&#x00C9;cole des Hautes &#x00C9;tudes en Sciences Sociales, Paris, France"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cognition.2017.11.008"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511576164"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/2.28"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.3115\/1557690.1557736"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947338"},{"key":"ref7","first-page":"40","article-title":"A nonparametric Bayesian approach to acoustic model discovery","volume-title":"Proc. Assoc. Comput. Linguistics","author":"Lee","year":"2012"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639245"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855085"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.031"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268953"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2904"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2743"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1755"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-639"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-643"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-642"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/GLU.2017-6"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.032"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-644"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-45925-7_7"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-640"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472622"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269013"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269009"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269011"},{"key":"ref27","article-title":"Technical report the IRIT-UPS system, zerospeech 2017 track1: Unsupervised subword modeling","author":"Pellegrini","year":"2017"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269012"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269010"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383605"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2011.6163965"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269008"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-646"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-645"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2022.3229264"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00505"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2017.04.008"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3000"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1738"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-339"},{"key":"ref42","first-page":"1082","article-title":"Unsupervised spoken term discovery using wav2vec 2.0","volume-title":"Proc. IEEE Asia-Pacific Signal Inf. Process. Assoc. Annu. Summit Conf.","author":"Iwamoto","year":"2021"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.033"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3232"},{"key":"ref45","first-page":"1113","article-title":"Zero resource speech synthesis using transcripts derived from perceptual acoustic units","volume-title":"Proc. Interspeech","author":"Pandia","year":"2019"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1518"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1337"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2048"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/GlobalSIP45357.2019.8969412"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1430"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2765"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414899"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3127"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-2731"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1785"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2559"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1693"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3033"},{"key":"ref59","article-title":"The zero resource speech benchmark 2021: Metrics and baselines for unsupervised spoken language modeling","author":"Nguyen","year":"2020"},{"key":"ref60","article-title":"ZR-2021VG: Zero-resource speech challenge, visually-grounded language modelling track","author":"Alishahi","year":"2021"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1182"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1465"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1503"},{"key":"ref64","article-title":"Self-supervised representation learning for speech using visual grounding and masked language modeling","author":"Peng","year":"2022"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3200909"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3180684"},{"key":"ref67","first-page":"18003","article-title":"ContentVec: An improved self-supervised speech representation by disentangling speakers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qian","year":"2022"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746102"},{"key":"ref69","first-page":"3863","article-title":"Collecting resources in sub-saharan african languages for automatic speech recognition: A case study of Wolof","volume-title":"Proc. 10th Lang. Resour. Eval. Conf.","author":"Gauthier","year":"2016"},{"key":"ref70","article-title":"Development of Indonesian large vocabulary continuous speech recognition system within A-STAR project","volume-title":"Proc. Workshop Technol. Corpora Asia-Pacific Speech Transl.","author":"Sakti","year":"2008"},{"key":"ref71","first-page":"215","article-title":"Development of HMM-based Indonesian speech synthesis","volume-title":"Proc. Oriental COCOSDA","author":"Sakti","year":"2008"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-441"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-228"},{"key":"ref74","article-title":"Representation learning with contrastive predictive coding","volume":"abs\/1807.03748","author":"Oord","year":"2018"},{"key":"ref75","first-page":"12449","article-title":"wav2vec 2.0: A. framework for self-supervised learning of speech representations","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref77","article-title":"A brief overview of unsupervised neural speech representation learning","author":"Borgholt","year":"2022"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1037\/a0034245"},{"key":"ref79","first-page":"184","article-title":"Bootstrapping a unified model of lexical and phonetic acquisition","volume-title":"Proc. Assoc. Comput. Linguistics","author":"Elsner","year":"2012"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1111\/cogs.12008"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.conll-1.51"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.523"},{"key":"ref83","volume-title":"An Introduction to the Psychology of Hearing","author":"Moore","year":"2012"},{"key":"ref84","doi-asserted-by":"crossref","DOI":"10.1101\/2021.04.19.440438","article-title":"The psychometrics of automatic speech recognition","volume-title":"bioRxiv","author":"Weerts","year":"2021"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"ref86","article-title":"Masked autoencoders that listen","author":"Huang","year":"2022"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-10961"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.909282"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495637"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2011.6163965"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2194283"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639241"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1016\/j.cognition.2009.03.008"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2362"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7503.003.0085"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1874"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461761"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2016-33"},{"key":"ref99","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.3758\/BRM.42.3.627"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00321"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2341"},{"key":"ref105","volume-title":"Verb Similarity on the Taxonomy of WordNet","author":"Yang","year":"2006"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.3115\/1620754.1620758"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1080\/01690969108406936"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/365628.365657"},{"key":"ref109","first-page":"104","article-title":"Better word representations with recursive neural networks for morphology","volume-title":"Proc. Comput. Natural Lang. Learn.","author":"Luong","year":"2013"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1162\/COLI_a_00237"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1235"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1034"},{"key":"ref113","first-page":"136","article-title":"Distributional semantics in technicolor","volume-title":"Proc. Assoc. Comput. Linguistics","author":"Bruni","year":"2012"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1145\/1963405.1963455"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/2339530.2339751"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref117","first-page":"1858","article-title":"Unsupervised learning of spoken language with visual context","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Harwath","year":"2016"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p17-1057"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1523\/JNEUROSCI.0065-18.2018"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1111\/cogs.12943"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.2307\/538001"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1007\/s11525-014-9242-z"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.593"},{"key":"ref126","article-title":"Generative spoken dialogue language modeling","author":"Nguyen","year":"2022"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-373"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1671"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383461"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003853"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1951"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref133","article-title":"Translatotron 2: Robust direct speech-to-speech translation","author":"Jia","year":"2021"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.63"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9923627\/09888095.pdf?arnumber=9888095","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,24]],"date-time":"2024-01-24T01:55:29Z","timestamp":1706061329000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9888095\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10]]},"references-count":134,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2022.3206084","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10]]}}}