{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:22:23Z","timestamp":1772907743945,"version":"3.50.1"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10022552","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"1-8","source":"Crossref","is-referenced-by-count":10,"title":["CCC-WAV2VEC 2.0: Clustering AIDED Cross Contrastive Self-Supervised Learning of Speech Representations"],"prefix":"10.1109","author":[{"given":"Vasista Sai","family":"Lodagala","sequence":"first","affiliation":[{"name":"Indian Institute of Technology,Madras"}]},{"given":"Sreyan","family":"Ghosh","sequence":"additional","affiliation":[{"name":"University of Maryland,College Park"}]},{"given":"S.","family":"Umesh","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology,Madras"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","journal-title":"ICML 2020"},{"key":"ref2","author":"Devlin","year":"2018","journal-title":"Bert: Pre-training of deep bidirectional transformers for language understanding"},{"key":"ref3","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747077"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref7","article-title":"The history of speech recognition to the year 2030","author":"Hannun","year":"2021","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Audio augmentation for speech recognition","volume-title":"Sixteenth annual conference of the ISCA","author":"Ko"},{"key":"ref9","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","author":"Amodei","journal-title":"ICML 2016"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-667"},{"key":"ref11","author":"Gao","year":"2021","journal-title":"Multi-variant consistency based self-supervised learning for robust automatic speech recognition"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383605"},{"key":"ref13","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv preprint"},{"key":"ref14","first-page":"9912","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","author":"Caron","journal-title":"NeurIPS 2020"},{"key":"ref15","article-title":"Contrastive learning with hard negative samples","author":"Robinson","year":"2021","journal-title":"ICLR"},{"key":"ref16","first-page":"8765","article-title":"Debiased contrastive learning","author":"Chuang","journal-title":"NeurIPS 2020"},{"key":"ref17","article-title":"Negative selection by clustering for contrastive learning in human activity recognition","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/757"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00937"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.1992.225858"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054458"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-391"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref26","first-page":"18003","article-title":"Contentvec: An improved self-supervised speech representation by disentangling speakers","author":"Qian","journal-title":"ICML 2022"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17037"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7298682"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00742"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414483"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-958"},{"key":"ref36","author":"Spijkervet","year":"2021","journal-title":"Spijkervet\/torchaudio-augmentations"},{"key":"ref37","first-page":"563","volume-title":"K-Means Clustering","author":"Jin","year":"2010"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref40","article-title":"MUSAN: A Music, Speech, and Noise Corpus","author":"Snyder","year":"2015","journal-title":"arXiv"},{"key":"ref41","author":"Balam","year":"2020","journal-title":"Improving noise robustness of an end-to-end neural model for automatic speech recognition"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","location":"Doha, Qatar","start":{"date-parts":[[2023,1,9]]},"end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10022552.pdf?arnumber=10022552","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T06:50:12Z","timestamp":1707807012000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10022552\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10022552","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}