{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:03:00Z","timestamp":1775199780043,"version":"3.50.1"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006112","name":"Microsoft Research","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006112","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434653","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["JOOCI: a Novel Method for Learning Comprehensive Speech Representations"],"prefix":"10.1109","author":[{"given":"Hemant","family":"Yadav","sequence":"first","affiliation":[{"name":"IIIT,Delhi,India"}]},{"given":"Sunayana","family":"Sitaram","sequence":"additional","affiliation":[{"name":"Microsoft Research,India"}]},{"given":"Rajiv Ratn","family":"Shah","sequence":"additional","affiliation":[{"name":"IIIT,Delhi,India"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref2","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","author":"Alexey","year":"2020","journal-title":"arXiv preprint arXiv: 2010.11929"},{"key":"ref3","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref5","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2022","journal-title":"arXiv preprint arXiv:2210.13438"},{"key":"ref6","article-title":"Efficient self-supervised learning with contextualized target representations for vision, speech and language","author":"Baevski","year":"2022","journal-title":"arXiv preprint arXiv:2212.07525"},{"key":"ref7","article-title":"Exploration on hubert with multiple resolutions","author":"Shi","year":"2023","journal-title":"arXiv preprint arXiv:2306.01084"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096167"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref15","article-title":"Silence is sweeter than speech: Self-supervised model using silence to store speaker information","author":"Feng","year":"2022","journal-title":"arXiv preprint arXiv:2205.03759"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747077"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1978"},{"key":"ref18","article-title":"Deep Learning","author":"Goodfellow","year":"2016"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref20","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"key":"ref21","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024","journal-title":"arXiv preprint arXiv:2407.21783"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref24","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"International Conference on Machine Learning.","author":"Baevski"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747526"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2822810"},{"key":"ref27","article-title":"Multi-resolution hubert: Multi-resolution speech self-supervised learning with masked unit prediction","author":"Shi","year":"2023","journal-title":"arXiv preprint arXiv:2310.02720"},{"key":"ref28","first-page":"18003","article-title":"Contentvec: An improved self-supervised speech representation by disentangling speakers","volume-title":"International Conference on Machine Learning.","author":"Qian"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-847"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-847"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.394"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-390"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096915"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2938758"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-993"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10368"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref40","article-title":"Musan: A music, speech, and noise corpus","author":"Snyder","year":"2015","journal-title":"arXiv preprint arXiv:1510.08484"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3389631"},{"key":"ref42","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"International conference on machine learning.","author":"Chen"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096149"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023274"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW59220.2023.10193427"},{"key":"ref47","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv preprint arXiv:2301.02111"},{"key":"ref48","article-title":"Vall-e 2: Neural codec language models are human parity zero-shot text to speech synthesizers","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2406.05370"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434653.pdf?arnumber=11434653","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:57:48Z","timestamp":1775192268000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434653\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434653","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}