{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T19:01:34Z","timestamp":1767034894769,"version":"3.45.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230922","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":1,"title":["Bridging Ears and Eyes: Analyzing Audio and Visual Large Language Models to Humans in Visible Sound Recognition and Reducing Their Sensory Gap via Cross-Modal Distillation"],"prefix":"10.1109","author":[{"given":"Xilin","family":"Jiang","sequence":"first","affiliation":[{"name":"Columbia University,NY,USA"}]},{"given":"Junkai","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Washington,WA,USA"}]},{"given":"Vishal","family":"Choudhari","sequence":"additional","affiliation":[{"name":"Columbia University,NY,USA"}]},{"given":"Nima","family":"Mesgarani","sequence":"additional","affiliation":[{"name":"Columbia University,NY,USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.051508.135431"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.23.1.315"},{"article-title":"Sparks of large audio models: A survey and outlook","year":"2023","author":"Latif","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"ref5","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International conference on machine learning","author":"Radford"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy","key":"ref6"},{"article-title":"Improving language understanding by generative pre-training","year":"2018","author":"Radford","key":"ref7"},{"issue":"8","key":"ref8","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2004.02.002"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2008.07.006"},{"issue":"12","key":"ref11","first-page":"3","article-title":"Synaesthesia\u2013a window into perception, thought and language","volume":"8","author":"Ramachandran","year":"2001","journal-title":"Journal of consciousness studies"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/6590.001.0001"},{"article-title":"Qwen2-audio technical report","year":"2024","author":"Chu","key":"ref13"},{"article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","year":"2024","author":"Wang","key":"ref14"},{"article-title":"Qwen2. 5-omni technical report","year":"2025","author":"Xu","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref20","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref22","first-page":"4297","article-title":"AudioBench: A universal benchmark for audio large language models","volume-title":"Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)","author":"Wang"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4464"},{"article-title":"AVHBench: A cross-modal hallucination benchmark for audio-visual large language models","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Sung-Bin","key":"ref24"},{"article-title":"Avtrustbench: Assessing and enhancing reliability and robustness in audio-visual llms","year":"2025","author":"Chowdhury","key":"ref25"},{"article-title":"MiniLLM: Knowledge distillation of large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Gu","key":"ref26"},{"key":"ref27","first-page":"13899","article-title":"EfficientVLM: Fast and accurate vision-language models via knowledge distillation and modal-adaptive pruning","volume-title":"Findings of the Association for Computational Linguistics: ACL 2023","author":"Wang"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00694"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i13.29407"},{"article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","year":"2023","author":"Chu","key":"ref30"},{"article-title":"Kimi-audio technical report","year":"2025","author":"Ding","key":"ref31"},{"article-title":"Qwen2. 5-vl technical report","year":"2025","author":"Bai","key":"ref32"},{"article-title":"Videollama 3: Frontier multimodal foundation models for image and video understanding","year":"2025","author":"Zhang","key":"ref33"},{"article-title":"Qwen2 technical report","year":"2024","author":"Yang","key":"ref34"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3229643"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2297"},{"issue":"2","key":"ref37","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1257"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00265"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230922.pdf?arnumber=11230922","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:23:01Z","timestamp":1763191381000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230922\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230922","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}