{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:10Z","timestamp":1765308010903,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","funder":[{"name":"Shenzhen Science and Technology Program","award":["JCYJ20220818103001002"],"award-info":[{"award-number":["JCYJ20220818103001002"]}]},{"name":"Shenzhen Doctoral Startup Funding","award":["RCBS20221008093330065"],"award-info":[{"award-number":["RCBS20221008093330065"]}]},{"name":"Tianyuan Fund for Mathematics of National Natural Science Foundation of China (NSFC)","award":["12326608"],"award-info":[{"award-number":["12326608"]}]},{"name":"Shenzhen Science and Technology Program (Shenzhen Key Laboratory)","award":["Grant No. ZDSYS20230626091302006"],"award-info":[{"award-number":["Grant No. ZDSYS20230626091302006"]}]},{"name":"Shenzhen Stability Science Program 2023","award":["None"],"award-info":[{"award-number":["None"]}]},{"name":"Shenzhen Key Lab of Multi-Modal Cognitive Computing","award":["None"],"award-info":[{"award-number":["None"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758173","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:44:48Z","timestamp":1761371088000},"page":"12464-12472","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Can Audio Language Models Listen Between the Lines? A Study on Metaphorical Reasoning via <scp>Unspoken<\/scp>"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6715-357X","authenticated-orcid":false,"given":"Hongru","family":"Xiao","sequence":"first","affiliation":[{"name":"College of Civil Engineering, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2453-3390","authenticated-orcid":false,"given":"Xiang","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4769-3502","authenticated-orcid":false,"given":"Duyi","family":"Pan","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7317-9883","authenticated-orcid":false,"given":"Longfei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Henan Polytechnic University, JiaoZuo, Henan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7152-0125","authenticated-orcid":false,"given":"ZhixueSong","family":"ZhixueSong","sequence":"additional","affiliation":[{"name":"China University of Mining Technology - Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6477-0424","authenticated-orcid":false,"given":"Jiale","family":"Han","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3132-9414","authenticated-orcid":false,"given":"Songning","family":"Lai","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1966-6059","authenticated-orcid":false,"given":"Wenshuo","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9430-9660","authenticated-orcid":false,"given":"Jing","family":"Tang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1501-9914","authenticated-orcid":false,"given":"Benyou","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conf. on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, and et al. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conf. on Neural Information Processing Systems 2020, NeurIPS 2020, December 6--12, 2020, virtual, Hugo Larochelle, Marc'Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.)."},{"key":"e_1_3_2_1_2_1","volume-title":"VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers. CoRR","author":"Chen Sanyuan","year":"2024","unstructured":"Sanyuan Chen, Shujie Liu, Long Zhou, Yanqing Liu, Xu Tan, Jinyu Li, Sheng Zhao, Yao Qian, and Furu Wei. 2024. VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers. CoRR (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"VoiceBench: Benchmarking LLM-Based Voice Assistants. CoRR","author":"Chen Yiming","year":"2024","unstructured":"Yiming Chen, Xianghu Yue, Chen Zhang, Xiaoxue Gao, Robby T. Tan, and Haizhou Li. 2024. VoiceBench: Benchmarking LLM-Based Voice Assistants. CoRR (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin Chang Zhou and Jingren Zhou. 2024. Qwen2-Audio Technical Report. CoRR (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"David Dale, Ning Dong, Paul-Ambroise Duquenne, and et al.","author":"Communication Seamless","year":"2023","unstructured":"Seamless Communication, Lo\u00efc Barrault, Yu-An Chung, Mariano Coria Meglioli, David Dale, Ning Dong, Paul-Ambroise Duquenne, and et al. 2023. SeamlessM4TMassively Multilingual & Multimodal Machine Translation. CoRR (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Moshi: a speech-text foundation model for real-time dialogue. CoRR","author":"D\u00e9fossez Alexandre","year":"2024","unstructured":"Alexandre D\u00e9fossez, Laurent Mazar\u00e9, Manu Orsini, Am\u00e9lie Royer, Patrick P\u00e9rez, Herv\u00e9 J\u00e9gou, Edouard Grave, and Neil Zeghidour. 2024. Moshi: a speech-text foundation model for real-time dialogue. CoRR (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Audio Entailment: Assessing Deductive Reasoning for Audio Understanding. In AAAI-25, Sponsored by the Association for the Advancement of Artificial Intelligence, February 25 -","author":"Deshmukh Soham","year":"2025","unstructured":"Soham Deshmukh, Shuo Han, Hazim T. Bukhari, Benjamin Elizalde, Hannes Gamper, Rita Singh, and Bhiksha Raj. 2025. Audio Entailment: Assessing Deductive Reasoning for Audio Understanding. In AAAI-25, Sponsored by the Association for the Advancement of Artificial Intelligence, February 25 - March 4, 2025, Philadelphia, PA, USA, Toby Walsh, Julie Shah, and Zico Kolter (Eds.). AAAI Press, 23769--23777."},{"key":"e_1_3_2_1_8_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman and et al. 2024. The Llama 3 Herd of Models. CoRR (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Natural Language Supervision For General-Purpose Audio Representations. In IEEE International Conf. on Acoustics, Speech and Signal Processing, ICASSP 2024","author":"Elizalde Benjamin","year":"2024","unstructured":"Benjamin Elizalde, Soham Deshmukh, and Huaming Wang. 2024. Natural Language Supervision For General-Purpose Audio Representations. In IEEE International Conf. on Acoustics, Speech and Signal Processing, ICASSP 2024, Seoul, Republic of Korea, April 14--19, 2024. IEEE, 336--340."},{"key":"e_1_3_2_1_10_1","volume-title":"LLaMA-Omni: Seamless Speech Interaction with Large Language Models. In The Thirteenth International Conf. on Learning Representations, ICLR 2025","author":"Fang Qingkai","year":"2025","unstructured":"Qingkai Fang, Shoutao Guo, Yan Zhou, Zhengrui Ma, Shaolei Zhang, and Yang Feng. 2025. LLaMA-Omni: Seamless Speech Interaction with Large Language Models. In The Thirteenth International Conf. on Learning Representations, ICLR 2025, Singapore, April 24--28, 2025. OpenReview.net."},{"key":"e_1_3_2_1_11_1","unstructured":"Chaoyou Fu Haojia Lin Xiong Wang Yi-Fan Zhang Yunhang Shen Xiaoyu Liu Haoyu Cao Zuwei Long Heting Gao Ke Li et al. 2025. Vita-1.5: Towards gpt-4o level real-time vision and speech interaction. arXiv preprint arXiv:2501.01957 (2025)."},{"key":"e_1_3_2_1_12_1","unstructured":"Ailin Huang Boyong Wu Bruce Wang Chao Yan Chen Hu Chengli Feng and et al. 2025. Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction. CoRR (2025)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448257"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102422"},{"key":"e_1_3_2_1_15_1","unstructured":"KimiTeam Ding Ding Zeqian Ju Yichong Leng Songxiang Liu Tong Liu Zeyu Shang Kai Shen Wei Song Xu Tan Heyi Tang Zhengtao Wang Chu Wei Yifei Xin Xinran Xu Jianwei Yu Yutao Zhang Xinyu Zhou Y. Charles Jun Chen Yanru Chen Yulun Du Weiran He Zhenxing Hu Guokun Lai Qingcheng Li Yangyang Liu Weidong Sun Jianzhou Wang Yuzhi Wang Yuefeng Wu Yuxin Wu Dongchao Yang Hao Yang Ying Yang Zhilin Yang Aoxiong Yin Ruibin Yuan Yutong Zhang and Zaida Zhou. 2025. Kimi-Audio Technical Report. arXiv:2504.18425 [eess.AS] https:\/\/arxiv.org\/abs\/2504.18425"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.figlang-1.3"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2078"},{"key":"e_1_3_2_1_18_1","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conf. on Neural Information Processing Systems","author":"Liu Ziqiang","year":"2024","unstructured":"Ziqiang Liu, Feiteng Fang, Xi Feng, Xeron Du, Chenhao Zhang, Noah Wang, Yuelin Bai, Qixuan Zhao, Liyang Fan, Chengguang Gan, Hongquan Lin, Jiaming Li, Yuansheng Ni, HaihongWu, Yaswanth Narsupalli, Zhigang Zheng, Chengming Li, Xiping Hu, Ruifeng Xu, Xiaojun Chen, Min Yang, Jiaheng Liu, Ruibo Liu,Wenhao Huang, Ge Zhang, and Shiwen Ni. 2024. II-Bench: An Image Implication Understanding Benchmark for Multimodal Large Language Models. In Advances in Neural Information Processing Systems 38: Annual Conf. on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub M. Tomczak, and Cheng Zhang (Eds.)."},{"key":"e_1_3_2_1_19_1","unstructured":"Zuwei Long Yunhang Shen Chaoyou Fu Heting Gao Lijiang Li Peixian Chen Mengdan Zhang Hang Shao Jian Li Jinlong Peng et al. 2025. VITA-Audio: Fast Interleaved Cross-Modal Token Generation for Efficient Large Speech-Language Model. arXiv preprint arXiv:2505.03739 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Titouan Parcollet Ha Nguyen Sol\u00e8ne Evain Marcely Zanon Boito Adrien Pupier Salima Mdhaffar Hang Le Sina Alisamir Natalia A. Tomashenko Marco Dinarelli Shucong Zhang Alexandre Allauzen Maximin Coavoux Yannick Est\u00e8ve Mickael Rouvier J\u00e9r\u00f4me Goulian Benjamin Lecouteux Fran\u00e7ois Portet Solange Rossato Fabien Ringeval Didier Schwab and Laurent Besacier. 2024. LeBenchmark 2.0: A standardized replicable and enhanced framework for self-supervised representations of French speech. Comput. Speech Lang. (2024) 101622.","DOI":"10.1016\/j.csl.2024.101622"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conf. on Machine Learning, ICML 2023","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, JongWook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In International Conf. on Machine Learning, ICML 2023, 23--29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 28492--28518."},{"key":"e_1_3_2_1_22_1","volume-title":"MMAU: A Massive Multi-Task Audio Understanding and Reasoning Benchmark. In The Thirteenth International Conf. on Learning Representations, ICLR 2025","author":"Sakshi S.","year":"2025","unstructured":"S. Sakshi, Utkarsh Tyagi, Sonal Kumar, Ashish Seth, Ramaneswaran Selvakumar, Oriol Nieto, Ramani Duraiswami, Sreyan Ghosh, and Dinesh Manocha. 2025. MMAU: A Massive Multi-Task Audio Understanding and Reasoning Benchmark. In The Thirteenth International Conf. on Learning Representations, ICLR 2025, Singapore, April 24--28, 2025. OpenReview.net."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-018-3760-2"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"e_1_3_2_1_25_1","volume-title":"Chen","author":"Wang Bin","year":"2024","unstructured":"Bin Wang, Xunlong Zou, Geyu Lin, Shuo Sun, Zhuohan Liu, Wenyu Zhang, Zhengyuan Liu, AiTi Aw, and Nancy F. Chen. 2024. AudioBench: A Universal Benchmark for Audio Large Language Models. CoRR (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.109"},{"key":"e_1_3_2_1_27_1","volume-title":"Glm-4-voice: Towards intelligent and human-like end-to-end spoken chatbot. arXiv preprint arXiv:2412.02612","author":"Zeng Aohan","year":"2024","unstructured":"Aohan Zeng, Zhengxiao Du, Mingdao Liu, Kedong Wang, Shengmin Jiang, Lei Zhao, Yuxiao Dong, and Jie Tang. 2024. Glm-4-voice: Towards intelligent and human-like end-to-end spoken chatbot. arXiv preprint arXiv:2412.02612 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Dongyu Zhang Minghao Zhang Heting Zhang Liang Yang and Hongfei Lin. 2021. MultiMET: A Multimodal Dataset for Metaphor Understanding. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conf. on Natural Language Processing ACL\/IJCNLP 2021 (Vol. 1: Long Papers) Virtual Event August 1--6 2021 Chengqing Zong Fei Xia Wenjie Li and Roberto Navigli (Eds.). Association for Computational Linguistics 3214--3225.","DOI":"10.18653\/v1\/2021.acl-long.249"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:15:09Z","timestamp":1765307709000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758173"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":28,"alternative-id":["10.1145\/3746027.3758173","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758173","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}