{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:07:05Z","timestamp":1776881225112,"version":"3.51.2"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434802","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":1,"title":["USAD: Universal Speech and Audio Representation via Distillation"],"prefix":"10.1109","author":[{"given":"Heng-Jui","family":"Chang","sequence":"first","affiliation":[{"name":"MIT CSAIL,Cambridge,MA,USA"}]},{"given":"Saurabhchand","family":"Bhati","sequence":"additional","affiliation":[{"name":"MIT CSAIL,Cambridge,MA,USA"}]},{"given":"James","family":"Glass","sequence":"additional","affiliation":[{"name":"MIT CSAIL,Cambridge,MA,USA"}]},{"given":"Alexander H.","family":"Liu","sequence":"additional","affiliation":[{"name":"MIT CSAIL,Cambridge,MA,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21315"},{"key":"ref5","article-title":"Beats: Audio pre-training with acoustic tokenizers","author":"Chen","year":"2022","journal-title":"ICML"},{"key":"ref6","article-title":"Mert: Acoustic music understanding model with large-scale self-supervised training","author":"Li","year":"2024","journal-title":"ICLR"},{"key":"ref7","article-title":"Listen, think, and understand","author":"Gong","year":"2024","journal-title":"ICLR"},{"key":"ref8","article-title":"SALMONN: Towards generic hearing abilities for large language models","author":"Tang","year":"2024","journal-title":"ICLR"},{"key":"ref9","article-title":"Qwen2-audio technical report","volume-title":"arXiv","author":"Chu","year":"2024"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.361"},{"key":"ref11","article-title":"Google usm: Scaling automatic speech recognition beyond 100 languages","author":"Zhang","year":"2023","journal-title":"arXiv"},{"key":"ref12","article-title":"Speechtokenizer: Unified speech tokenizer for speech language models","author":"Zhang","year":"2024","journal-title":"ICLR"},{"key":"ref13","article-title":"Soundstorm: Efficient parallel audio generation","author":"Borsos","year":"2023","journal-title":"arXiv"},{"key":"ref14","article-title":"Moshi: a speech-text foundation model for real-time dialogue","author":"Defossez","year":"2024","journal-title":"arXiv"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-246"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389742"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-1524"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446637"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10961"},{"key":"ref20","article-title":"Masked spectrogram modeling using masked autoencoders for learning general-purpose audio representation","author":"Niizumi","year":"2022","journal-title":"HEAR: Holistic Evaluation of Audio Representations (NeurIPS 2021 Competition)"},{"key":"ref21","article-title":"Masked autoencoders that listen","author":"Huang","year":"2022","journal-title":"NeurIPS"},{"key":"ref22","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","author":"Baevski","year":"2022","journal-title":"ICML"},{"key":"ref23","article-title":"Efficient self-supervised learning with contextualized target representations for vision, speech and language","author":"Baevski","year":"2023","journal-title":"ICML"},{"key":"ref24","article-title":"Dinosr: Self-distillation and online clustering for self-supervised speech representation learning","author":"Liu","year":"2023","journal-title":"NeurIPS"},{"key":"ref25","article-title":"Eat: Self-supervised pre-training with efficient audio transformer","author":"Chen","year":"2024","journal-title":"IJCAI"},{"key":"ref26","article-title":"Sslam: Enhancing self-supervised models with audio mixtures for polyphonic soundscapes","author":"Alex","year":"2025","journal-title":"ICLR"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534474"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3352248"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097236"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1213"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832169"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096445"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-747"},{"key":"ref35","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref37","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2023","journal-title":"ICML"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref43","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2020","journal-title":"LREC"},{"key":"ref44","article-title":"The fisher corpus: A resource for the next generations of speech-to-text","author":"Cieri","year":"2004","journal-title":"LREC"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383459"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref47","article-title":"Soundnet: Learning sound representations from unlabeled video","author":"Aytar","year":"2016","journal-title":"NIPS"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/iwssip48289.2020.9145170"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref51","article-title":"Layer normalization","author":"Ba","year":"2016","journal-title":"arXiv"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"key":"ref53","article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","author":"Warden","year":"2018","journal-title":"arXiv"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3389631"},{"key":"ref57","article-title":"Hear: Holistic evaluation of audio representations","author":"Turian","year":"2022","journal-title":"NeurIPS 2021 Competitions and Demonstrations Track"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref59","article-title":"Superb@ slt 2022: Challenge on generalization and efficiency of self-supervised speech representation learning","author":"Feng","year":"2022","journal-title":"SLT"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434802.pdf?arnumber=11434802","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:56Z","timestamp":1775192396000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434802\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434802","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}