{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,21]],"date-time":"2026-07-21T15:02:30Z","timestamp":1784646150876,"version":"3.55.0"},"reference-count":124,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"EPSRC programme","award":["EP\/T028572\/1"],"award-info":[{"award-number":["EP\/T028572\/1"]}]},{"name":"MSIT, Korea"},{"name":"Information Technology Research Center","award":["IITP-2024-RS-2023-00259991"],"award-info":[{"award-number":["IITP-2024-RS-2023-00259991"]}]},{"name":"IITP"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3444456","type":"journal-article","created":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T15:47:50Z","timestamp":1724168870000},"page":"3850-3866","source":"Crossref","is-referenced-by-count":24,"title":["The VoxCeleb Speaker Recognition Challenge: A Retrospective"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7247-6401","authenticated-orcid":false,"given":"Jaesung","family":"Huh","sequence":"first","affiliation":[{"name":"Visual Geometry Group, University of Oxford, Oxford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7741-7275","authenticated-orcid":false,"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science, and Technology, Daejeon, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2190-9013","authenticated-orcid":false,"given":"Arsha","family":"Nagrani","sequence":"additional","affiliation":[{"name":"Visual Geometry Group, University of Oxford, Oxford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Brown","sequence":"additional","affiliation":[{"name":"Visual Geometry Group, University of Oxford, Oxford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0505-2988","authenticated-orcid":false,"given":"Jee-weon","family":"Jung","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Daniel","family":"Garcia-Romero","sequence":"additional","affiliation":[{"name":"Johns Hopkins University, Baltimore, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8945-8573","authenticated-orcid":false,"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[{"name":"Visual Geometry Group, University of Oxford, Oxford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"VoxSRC 2019: The first voxCeleb speaker recognition challenge","author":"Chung","year":"2019"},{"key":"ref2","article-title":"VoxSRC 2020: The second voxCeleb speaker recognition challenge","author":"Nagrani","year":"2020"},{"key":"ref3","article-title":"VoxSRC 2021: The third voxCeleb speaker recognition challenge","author":"Brown","year":"2022"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2024.3444456"},{"key":"ref5","first-page":"15","article-title":"NIST speaker recognition evaluation chronicles","volume-title":"Proc. Odyssey-Speaker Lang. Recognit. Workshop","author":"Alvin","year":"2004"},{"key":"ref6","article-title":"The NIST year 2012 speaker recognition evaluation plan","author":"Greenberg","year":"2012"},{"key":"ref7","article-title":"NIST 2018 speaker recognition evaluation plan","year":"2018"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-458"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-38"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1129"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1893"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref17","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2022.01.002"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.l007\/978-3-319-46448-0_2"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00020"},{"key":"ref23","first-page":"251","article-title":"Out of time: Automated lip sync in the wild","volume-title":"Proc. Workshop Multi-View Lip-Reading","author":"Chung","year":"2016"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413815"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383459"},{"key":"ref27","article-title":"Facelib","year":"2022"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2337"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096449"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-1064"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1268"},{"issue":"198","key":"ref34","first-page":"1","article-title":"CodaLab competitions: An open source platform to organize scientific challenges","volume":"24","author":"Pavao","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413948"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1388"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052974"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref39","article-title":"ClovaAI voxCeleb trainer","year":"2024"},{"key":"ref40","article-title":"Simple diarization repository","year":"2024"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref42","article-title":"MUSAN: A music, speech, and noise corpus","author":"Snyder","year":"2015"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288859"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-803"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-54"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01352"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2822810"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747077"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-143"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1083"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414600"},{"key":"ref58","article-title":"The speakin system for voxCeleb speaker recognition challange 2021","author":"Zhao","year":"2021"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638975"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2013.2279332"},{"key":"ref61","article-title":"ID R&D system description to voxCeleb speaker recognition challenge 2022","author":"Makarov","year":"2022"},{"key":"ref62","article-title":"The ID R&D voxCeleb speaker recognition challenge 2023 system description","author":"Torgashov","year":"2023"},{"key":"ref63","article-title":"But system description to voxCeleb speaker recognition challenge 2019","author":"Zeinali","year":"2019"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053209"},{"key":"ref65","doi-asserted-by":"crossref","DOI":"10.1109\/ASRU46091.2019.9003826","article-title":"CNN with phonetic attention for text-independent speaker verification","author":"Zhou","year":"2019"},{"key":"ref66","article-title":"The xx205 system for the voxCeleb speaker recognition challenge 2020","author":"Xiang","year":"2020"},{"key":"ref67","article-title":"Beijing ZKJ-NPU speaker verification system for voxCeleb speaker recognition challenge 2021","author":"Zhang","year":"2021"},{"key":"ref68","article-title":"The Kriston AI system for the voxCeleb speaker recognition challenge 2022","author":"Cai","year":"2022"},{"key":"ref69","article-title":"Unisound system for voxCeleb speaker recognition challenge 2023","author":"Zheng","year":"2023"},{"key":"ref70","article-title":"The bilibili voxCeleb speaker recognition challenge 2023 system description","author":"Zeng","year":"2023"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2007.366191"},{"key":"ref72","article-title":"Augmentation adversarial training for unsupervised speaker recognition","volume-title":"Proc. Workshop Self-Supervised Learn. Speech Audio Process.","author":"Huh","year":"2020"},{"key":"ref73","article-title":"The DKU-DukeECE systems for voxCeleb speaker recognition challenge 2020","author":"Wang","year":"2020"},{"key":"ref74","article-title":"The DKU-dukeECE system for the self-supervision speaker verification task of the 2021 voxCeleb speaker recognition challenge","author":"Cai","year":"2021"},{"key":"ref75","article-title":"The phonexia voxCeleb speaker recognition challenge 2021 system description","author":"Slavek","year":"2021"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_43"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414713"},{"key":"ref78","article-title":"The HCCL system for voxCeleb speaker recognition challenge 2022","author":"Zhao","year":"2023"},{"key":"ref79","article-title":"The DKU-tencent system for the voxCeleb speaker recognition challenge 2022","author":"Qin","year":"2022"},{"key":"ref80","article-title":"The DKU-MSXF speaker verification system for the voxCeleb speaker recognition challenge 2023","author":"Li","year":"2023"},{"key":"ref81","article-title":"The xx205 voxCeleb speaker recognition challenge 2023 system description","author":"Xiang","year":"2023"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-58347-1_8"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682852"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2938758"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383490"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-1602"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746294"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094752"},{"key":"ref90","article-title":"Microsoft speaker diarization system for the voxCeleb speaker recognition challenge 2020","author":"Xiao","year":"2020"},{"key":"ref91","first-page":"5819","article-title":"Analysis of the but diarization system for voxconverse challenge","volume-title":"Proc. IEEE Int. Conf. Acoust., Speech Signal Process.","author":"Landini","year":"2020"},{"key":"ref92","article-title":"The DKU-dukeECE-Lenovo system for the diarization task of the 2021 voxCeleb speaker recognition challenge","author":"Wang","year":"2021"},{"key":"ref93","article-title":"The bytedance speaker diarization system for the voxCeleb speaker recognition challenge 2021","author":"Wang","year":"2021"},{"key":"ref94","article-title":"The DKU-dukeECE diarization system for the voxCeleb speaker recognition challenge 2022","author":"Wang","year":"2022"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-97-0601-3_28"},{"key":"ref96","article-title":"The krisp diarization system for the voxCeleb speaker recognition challenge 2023","volume-title":"Proc. VoxCeleb Speaker Recognit. Challenge 2023","author":"Karamyan","year":"2023"},{"key":"ref97","article-title":"The Kaldi speech recognition toolkit","volume-title":"Proc. IEEE 2011 Workshop Autom. Speech Recognit. Understanding, Conf.","author":"Povey","year":"2011"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-01793-3_47"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683120"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-993"},{"key":"ref101","article-title":"Eventbrite webpage","year":"2023"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1002\/9781444395068"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947703"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/29.21701"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-647"},{"key":"ref106","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref107","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref109","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953094"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683892"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383502"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2899"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1022"},{"key":"ref116","first-page":"1983","article-title":"pyannote. audio 2.1 speaker diarization pipeline: Principle, benchmark, and recipe","volume-title":"Proc. 24th Interspeech Conf.","author":"Bredin","year":"2023"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447957"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2021.101317"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414333"},{"key":"ref120","article-title":"pyannote.audio speaker diarization pipeline at VoxSRC 2023","author":"Baroudi","year":"2023"},{"key":"ref121","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"key":"ref122","article-title":"Kaggle website","year":"2023"},{"key":"ref123","article-title":"EvalAI: Towards better evaluation systems for AI agents","author":"Yadav","year":"2019"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.105151"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0733-5"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10640299.pdf?arnumber=10640299","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,2]],"date-time":"2024-09-02T04:10:32Z","timestamp":1725250232000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10640299\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":124,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3444456","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}