{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:08:35Z","timestamp":1775200115399,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434734","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-4","source":"Crossref","is-referenced-by-count":0,"title":["VERSA-v2: A Modular and Scalable Toolkit for Speech and Audio Evaluation with Expanded Metrics, Visualization, and LLM Integration"],"prefix":"10.1109","author":[{"given":"Jiatong","family":"Shi","sequence":"first","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Bo-Hao","family":"Su","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Shikhar","family":"Bharadwaj","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Yiwen","family":"Zhao","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Shih-Heng","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Southern California,USA"}]},{"given":"Jionghao","family":"Hang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Haoran","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,China"}]},{"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,China"}]},{"given":"Wenhao","family":"Feng","sequence":"additional","affiliation":[{"name":"Renmin University of China,China"}]},{"given":"Yuxun","family":"Tang","sequence":"additional","affiliation":[{"name":"Renmin University of China,China"}]},{"given":"Nezih","family":"Topalo\u011flu","sequence":"additional","affiliation":[{"name":"Yeditepe University,Turkey"}]},{"given":"Siddhant","family":"Arora","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Jinchuan","family":"Tian","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"William","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Hye-jin","family":"Shim","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Wangyou","family":"Zhang","sequence":"additional","affiliation":[{"name":"Yeditepe University,Turkey"}]},{"given":"Wen-Chin","family":"Huang","sequence":"additional","affiliation":[{"name":"Nagoya University,Japan"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Telephone transmission quality subjective opinion tests. a method for subjective performance assessment of the quality of speech voice output devices","year":"1994"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-970"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832295"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10597"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-018-3849-7"},{"key":"ref6","article-title":"Challenge on sound scene synthesis: Evaluating text-to-audio generation","volume-title":"Audio Imagination: NeurIPS 2024 Workshop AI-Driven Speech, Music, and Sound Generation","author":"Lee"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832255"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-1977"},{"key":"ref10","article-title":"AudioLDM: Text-toaudio generation with latent diffusion models","volume-title":"Proc. ICML","author":"Liu"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446663"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-demo.19"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832289"},{"key":"ref14","article-title":"The Llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv preprint arXiv:2407.21783"},{"key":"ref15","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref16","article-title":"GLM-4-voice: Towards intelligent and human-like end-to-end spoken chatbot","author":"Zeng","year":"2024","journal-title":"arXiv preprint arXiv:2412.02612"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1191"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413711"},{"key":"ref19","article-title":"Understanding features and distance functions for music sequence alignment","volume-title":"Proc. ISMIR","author":"\u0130zmirli"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ASPAA.2005.1540223"},{"key":"ref21","article-title":"BEATs: Audio pre-training with acoustic tokenizers","volume-title":"Proc. ICML","author":"Chen"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-1960"},{"key":"ref23","article-title":"Meta Audiobox Aesthetics: Unified automatic quality assessment for speech, music, and sound","author":"Tjandra","year":"2025","journal-title":"arXiv preprint arXiv:2502.05139"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-478"},{"key":"ref25","article-title":"Self-supervised speech quality estimation and enhancement using only clean speech","volume-title":"ICLR","author":"Fu"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/5.842996"},{"key":"ref27","article-title":"Vox-Profile: A speech foundation model benchmark for characterizing diverse speaker and speech traits","author":"Feng","year":"2025","journal-title":"arXiv preprint arXiv:2505.14648"},{"key":"ref28","article-title":"SongEval: A benchmark dataset for song aesthetics evaluation","author":"Yao","year":"2025","journal-title":"arXiv preprint arXiv:2505.10793"},{"key":"ref29","article-title":"Talking turns: Benchmarking audio foundation models on turn-taking dynamics","volume-title":"Proc. ICLR","author":"Arora"},{"key":"ref30","article-title":"Kimi-audio technical report","volume-title":"arXiv preprint arXiv:2504.18425","author":"Ding","year":"2025"},{"key":"ref31","article-title":"Qwen2.5-omni technical report","volume-title":"arXiv preprint arXiv:2503.20215","author":"Xu","year":"2025"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"ref33","article-title":"faster-whisper: Faster whisper transcription with ctranslate2","year":"2025","journal-title":"SYSTRAN"},{"key":"ref34","article-title":"NeMo: a toolkit for building ai applications using neural modules","author":"Kuchaiev","year":"2019","journal-title":"arXiv preprint arXiv:1909.09577"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref36","article-title":"FireRedASR: Open-source industrial-grade mandarin speech recognition models from encoder-decoder to llm integration","author":"Xu","year":"2025","journal-title":"arXiv preprint arXiv:2501.14350"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-9996"},{"key":"ref38","article-title":"Singer identity representation learning using self-supervised techniques","volume-title":"Proc. ISMIR","author":"Torres"},{"key":"ref39","article-title":"Improving speech enhancement with multimetric supervision from learned quality assessment","author":"Wang","year":"2025","journal-title":"arXiv preprint arXiv:2506.12260"},{"key":"ref40","article-title":"ARECHO: Autoregressive evaluation via chain-based hypothesis optimization for speech multi-metric estimation","volume-title":"arXiv preprint arXiv:2505.24518","author":"Shi","year":"2025"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434734.pdf?arnumber=11434734","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:04Z","timestamp":1775192344000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434734\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434734","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}