{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:34:24Z","timestamp":1776890064296,"version":"3.51.2"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9747814","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T19:50:34Z","timestamp":1651089034000},"page":"6147-6151","source":"Crossref","is-referenced-by-count":107,"title":["Large-Scale Self-Supervised Speech Representation Learning for Automatic Speaker Verification"],"prefix":"10.1109","author":[{"given":"Zhengyang","family":"Chen","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Department of Computer Science and Engineering"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sanyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Wu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yao","family":"Qian","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengyi","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shujie","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanmin","family":"Qian","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,Department of Computer Science and Engineering"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"Zeng","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-64"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-703"},{"key":"ref31","article-title":"Res2net: A new multi-scale backbone architecture","author":"gao","year":"2019","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947357"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414600"},{"key":"ref35","article-title":"MUSAN: A Music, Speech, and Noise Corpus","author":"snyder","year":"2015"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-993"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1158"},{"key":"ref12","article-title":"Bert: Pretraining of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref13","article-title":"Improving language understanding by generative pretraining","author":"radford","year":"2018"},{"key":"ref14","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"2020"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413351"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414973"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414713"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1356"},{"key":"ref28","article-title":"Unispeech-sat: Universal speech representation learning with speaker aware pre-training","author":"chen","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1982"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2015.07.003"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023039"},{"key":"ref29","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc NIPS"},{"key":"ref5","article-title":"But system description to voxceleb speaker recognition challenge 2019","author":"zeinali","year":"2019"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1545"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"1487","DOI":"10.21437\/Interspeech.2017-1608","article-title":"End-to-end text-independent speaker verification with triplet loss on short utterances","author":"zhang","year":"2017","journal-title":"InterSpeech"},{"key":"ref2","article-title":"The speakin system for voxceleb speaker recognition challange 2021","author":"zhao","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1280"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1140"},{"key":"ref23","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-950","article-title":"Voxceleb: a large-scale speaker identification dataset","author":"nagrani","year":"2017"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639585"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2017.01.001"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Singapore, Singapore","start":{"date-parts":[[2022,5,23]]},"end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09747814.pdf?arnumber=9747814","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,15]],"date-time":"2022-08-15T20:11:18Z","timestamp":1660594278000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9747814\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9747814","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}