{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:45:58Z","timestamp":1774021558035,"version":"3.50.1"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100003696","name":"Electronics and Telecommunications Research Institute","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003696","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230960","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":1,"title":["Task-Specific Audio Coding for Machines: Machine-Learned Latent Features Are Codes for That Machine"],"prefix":"10.1109","author":[{"given":"Anastasia","family":"Kuznetsova","sequence":"first","affiliation":[{"name":"Indiana University,Bloomington,IN,USA"}]},{"given":"Inseon","family":"Jang","sequence":"additional","affiliation":[{"name":"Electronics and Telecommunications Research Institute,Daejeon,Korea"}]},{"given":"Wootaek","family":"Lim","sequence":"additional","affiliation":[{"name":"Electronics and Telecommunications Research Institute,Daejeon,Korea"}]},{"given":"Minje","family":"Kim","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign,Champaign,IL,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Coding of moving pictures and associated audio for digital storage media at up to about 1.5 Mbit\/s","year":"1993"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/5.842996"},{"key":"ref3","article-title":"Information technology \u2014 Generic coding of moving pictures and associated audio information \u2014 Part 7: Advanced Audio Coding (AAC)","year":"2006"},{"key":"ref4","year":"2010","journal-title":"ISO\/IEC - Information technology \u2013 MPEG audio technologies \u2013 Part 2: Spatial Audio Object Coding (SAOC), ISO\/IEC IS 23 003-2"},{"key":"ref5","article-title":"Transport of unified speech and audio coding (USAC)","year":"2011"},{"key":"ref6","article-title":"Wideband coding of speech at around 16 kbit\/s using Adaptive Multi-Rate Wideband (AMR-WB)","year":"2003"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref8","article-title":"High-fidelity audio compression with improved RVQGAN","author":"Kumar","year":"2023","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref9","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref10","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref11","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.682"},{"key":"ref13","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref14","article-title":"Beats: Audio pre-training with acoustic tokenizers","author":"Chen","year":"2022"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-590"},{"key":"ref16","article-title":"DASB - discrete audio and speech benchmark","author":"Mousavi","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.616"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.3039765"},{"key":"ref19","article-title":"Use Cases and Requirements on Audio Coding for Machines","volume-title":"International Organisation for Standardisation, Tech. Rep. N0046"},{"key":"ref20","first-page":"12 449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref23","article-title":"On The Landscape of Spoken Language Models: A Comprehensive Survey","author":"Arora","year":"2025"},{"key":"ref24","article-title":"Towards audio language modeling\u2013an overview","author":"Wu","year":"2024"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832289"},{"key":"ref26","article-title":"Recent advances in discrete speech tokens: A review","author":"Guo","year":"2025"},{"key":"ref27","article-title":"CosyVoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens","author":"Du","year":"2024"},{"key":"ref28","first-page":"1141","article-title":"Soft-to-hard vector quantization for end-to-end learning compressible representations","author":"Agustsson","year":"2017","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"ref29","article-title":"High-fidelity audio compression with improved RVQGAN","author":"Kumar","year":"2023"},{"issue":"333","key":"ref30","article-title":"Open-Source Conversational AI with SpeechBrain 1.0","volume":"25","author":"Ravanelli","year":"2024","journal-title":"Journal of Machine Learning Research"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-2012"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref33","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2015","journal-title":"CoRR"},{"key":"ref34","first-page":"6306","article-title":"Neural discrete representation learning","author":"van den Oord","year":"2017","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023039"},{"key":"ref37","article-title":"Tree-structured parzen estimator: Understanding its algorithm components and their roles for better empirical performance","author":"Watanabe","year":"2023"},{"key":"ref38","article-title":"Epistimio\/orion: Asynchronous Distributed Hyperparameter Optimization","author":"Bouthillier","year":"2022"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953233"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","location":"Tahoe City, CA, USA","start":{"date-parts":[[2025,10,12]]},"end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230960.pdf?arnumber=11230960","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T18:38:41Z","timestamp":1763404721000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230960\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230960","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}