{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:25:58Z","timestamp":1776889558639,"version":"3.51.2"},"reference-count":71,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434723","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":2,"title":["AudioLens: A Closer Look at Auditory Attribute Perception of Large Audio-Language Models"],"prefix":"10.1109","author":[{"given":"Chih-Kai","family":"Yang","sequence":"first","affiliation":[{"name":"National Taiwan University,Taipei,Taiwan"}]},{"given":"Neo","family":"Ho","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taipei,Taiwan"}]},{"given":"Yi-Jyun","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taipei,Taiwan"}]},{"given":"Hung-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taipei,Taiwan"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A survey of large language models","author":"Zhao","year":"2023","journal-title":"arXiv preprint arXiv:2303.18223"},{"key":"ref2","article-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv preprint arXiv:2407.21783"},{"key":"ref3","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint arXiv:2410.21276"},{"key":"ref4","article-title":"Listen, think, and understand","volume-title":"International Conference on Learning Representations","author":"Gong"},{"key":"ref5","article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023","journal-title":"arXiv preprint arXiv:2311.07919"},{"key":"ref6","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889444"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.361"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-457"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832184"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389742"},{"key":"ref12","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Tang"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1070"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.263"},{"key":"ref15","article-title":"Dynamic-SUPERB phase-2: A collaboratively expanding benchmark for measuring the capabilities of spoken language models with 180 tasks","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Huang"},{"key":"ref16","first-page":"1979","article-title":"AIR-bench: Benchmarking large audio-language models via generative comprehension","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Yang"},{"key":"ref17","article-title":"MMAU: A massive multi-task audio understanding and reasoning benchmark","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Sakshi"},{"key":"ref18","doi-asserted-by":"crossref","DOI":"10.52202\/079017-1813","article-title":"SD-eval: A benchmark dataset for spoken dialogue understanding beyond words","volume-title":"The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track","author":"Ao"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.218"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-839"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.514"},{"key":"ref22","article-title":"A preliminary exploration with gpt-4o voice mode","author":"Lin","year":"2025","journal-title":"arXiv preprint arXiv:2502.09940"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.309"},{"key":"ref24","article-title":"Unveiling a core linguistic region in large language models","author":"Zhao","year":"2023","journal-title":"arXiv preprint arXiv:2310.14928"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.550"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.781"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.567"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.581"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.191"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832317"},{"key":"ref31","article-title":"The curse of multi-modalities: Evaluating hallucinations of large multimodal models across language, visual, and audio","author":"Leng","year":"2024","journal-title":"arXiv preprint arXiv:2410.12787"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888384"},{"key":"ref33","volume-title":"Interpreting GPT: the logit lens","year":"2020"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.3"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.63317\/5griomztwyxs"},{"key":"ref36","article-title":"Eliciting latent predictions from transformers with the tuned lens","author":"Belrose","year":"2023","journal-title":"arXiv preprint arXiv:2303.08112"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"ref39","first-page":"125","article-title":"Hear: Holistic evaluation of audio representations","author":"Turian","year":"2022","journal-title":"NeurIPS 2021 Competitions and Demonstrations Track"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1722"},{"key":"ref41","article-title":"The zero resource speech benchmark 2021: Metrics and baselines for unsupervised spoken language modeling","volume-title":"NeuRIPS Workshop on Self-Supervised Learning for Speech and Audio Processing","author":"Nguyen"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446737"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW62465.2024.10626762"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1316"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref48","first-page":"12 449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-329"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096149"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00656"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1157"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832151"},{"key":"ref55","first-page":"53656","article-title":"And: Audio network dissection for interpreting deep acoustic models","volume-title":"International Conference on Machine Learning","author":"Wu"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832185"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-21707-4_30"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1073"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.574"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.387"},{"key":"ref61","article-title":"Understanding and enhancing safety mechanisms of LLMs via safety-specific neuron","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Zhao"},{"key":"ref62","first-page":"15466","article-title":"Patchscopes: a unifying framework for inspecting hidden representations of language models","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Ghandeharioun"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1162\/coli_a_00422"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1262"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.281"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.751"},{"key":"ref67","article-title":"Logitlens411ms: Extending logit lens analysis to modern large language models","author":"Wang","year":"2025","journal-title":"arXiv preprint arXiv:2503.11667"},{"key":"ref68","article-title":"Interpreting and editing vision-language representations to mitigate hallucinations","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Jiang"},{"key":"ref69","article-title":"Towards interpreting visual information processing in vision-language models","author":"Neo","year":"2024","journal-title":"arXiv preprint arXiv:2410.07149"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.387"},{"key":"ref71","article-title":"Measuring massive multitask language understanding","volume-title":"International Conference on Learning Representations","author":"Hendrycks"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434723.pdf?arnumber=11434723","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:50Z","timestamp":1775192330000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434723\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":71,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434723","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}