{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T10:57:06Z","timestamp":1730199426886,"version":"3.28.0"},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389725","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Pseudo-Label Based Supervised Contrastive Loss for Robust Speech Representations"],"prefix":"10.1109","author":[{"given":"Varun","family":"Krishna","sequence":"first","affiliation":[{"name":"Learning and Extraction of Acoustic Patterns (LEAP) Lab, Indian Institute of Science,Bangalore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sriram","family":"Ganapathy","sequence":"additional","affiliation":[{"name":"Learning and Extraction of Acoustic Patterns (LEAP) Lab, Indian Institute of Science,Bangalore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint arXiv:1810.04805"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref4","article-title":"Emergent abilities of large language models","author":"Wei","year":"2022","journal-title":"arXiv preprint arXiv:2206.07682"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2605"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"ref7","article-title":"Representation learning with contrastive predictive coding","author":"den Oord","year":"2018","journal-title":"arXiv preprint arXiv:1807.03748"},{"key":"ref8","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3200909"},{"key":"ref11","first-page":"3915","article-title":"Self-supervised learning with random-projection quantizer for speech recognition","volume-title":"International Conference on Machine Learning","author":"Chiu"},{"key":"ref12","article-title":"The zero resource speech benchmark 2021: Metrics and baselines for unsupervised spoken language modeling","author":"Nguyen","year":"2020","journal-title":"arXiv preprint arXiv:2011.11588"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1862"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1242"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414776"},{"key":"ref17","article-title":"On the robustness of self-supervised representations for spoken language modeling","author":"Gat","year":"2022","journal-title":"arXiv preprint arXiv:2209.15483"},{"key":"ref18","article-title":"Data augmenting contrastive learning of speech representations in the time domain","author":"Kharitonov","year":"2020","journal-title":"arXiv preprint arXiv:2007.00991"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022552"},{"key":"ref20","article-title":"Multi-variant consistency based self-supervised learning for robust automatic speech recognition","author":"Gao","year":"2021","journal-title":"arXiv preprint arXiv:2112.12522"},{"key":"ref21","article-title":"Learning with pseudo-ensembles","volume":"27","author":"Bachman","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref22","first-page":"18003","article-title":"Contentvec: An improved self-supervised speech representation by disentangling speakers","volume-title":"International Conference on Machine Learning","author":"Qian"},{"key":"ref23","first-page":"18661","article-title":"Supervised contrastive learning","volume":"33","author":"Khosla","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01045"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01618"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.359"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3337670"},{"key":"ref29","article-title":"Audio word2vec: Unsupervised learning of audio segment representations using sequence-tosequence autoencoder","author":"Chung","year":"2016","journal-title":"arXiv preprint arXiv:1603.00982"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-349"},{"key":"ref32","article-title":"vqwav2vec: Self-supervised learning of discrete speech representations","author":"Baevski","year":"2019","journal-title":"arXiv preprint arXiv:1910.05453"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-391"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383605"},{"key":"ref36","article-title":"Regularization with stochastic transformations and perturbations for deep semi-supervised learning","volume":"29","author":"Sajjadi","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2_7"},{"key":"ref38","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"International conference on machine learning","author":"Chen"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1465"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-441"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2020"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00321"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.6028\/nist.ir.4930"},{"key":"ref45","first-page":"707","article-title":"Binary codes capable of correcting deletions, insertions, and reversals","volume":"10","author":"al","year":"1966","journal-title":"Soviet physics doklady. Soviet Union"},{"key":"ref46","article-title":"Musan: A music, speech, and noise corpus","author":"Snyder","year":"2015","journal-title":"arXiv preprint arXiv:1510.08484"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1080\/03610927408827101"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"volume-title":"Free speech\u2026 recognition (linux, windows and mac)-voxforge.org","year":"2014","key":"ref49"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2336244"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2023,12,16]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389725.pdf?arnumber=10389725","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:42:42Z","timestamp":1706028162000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389725\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389725","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}