{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:43Z","timestamp":1781539063259,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62171138"],"award-info":[{"award-number":["62171138"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810622","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1073-1082","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Teaching Audio-Language Models to Reason over Time"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8533-285X","authenticated-orcid":false,"given":"Yufeng","family":"Xu","sequence":"first","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7100-2214","authenticated-orcid":false,"given":"Yunjia","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6535-6352","authenticated-orcid":false,"given":"Hai","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4486-8341","authenticated-orcid":false,"given":"Wei","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China and Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Debarpan Bhattacharya Apoorva Kulkarni and Sriram Ganapathy. 2025. Benchmarking and Confidence Evaluation of LALMs For Temporal Reasoning. arxiv:https:\/\/arXiv.org\/abs\/2505.13115\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2505.13115"},{"key":"e_1_3_3_1_3_2","unstructured":"Pengfei Cai Yan Song Qing Gu Nan Jiang Haoyu Song and Ian McLoughlin. 2025. Detect Any Sound: Open-Vocabulary Sound Event Detection with Multi-Modal Queries. arxiv:https:\/\/arXiv.org\/abs\/2507.16343\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/2507.16343"},{"key":"e_1_3_3_1_4_2","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin Chang Zhou and Jingren Zhou. 2024. Qwen2-Audio Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2407.10759\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2407.10759"},{"key":"e_1_3_3_1_5_2","unstructured":"Yunfei Chu Jin Xu Xiaohuan Zhou Qian Yang Shiliang Zhang Zhijie Yan Chang Zhou and Jingren Zhou. 2023. Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. arxiv:https:\/\/arXiv.org\/abs\/2311.07919\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2311.07919"},{"key":"e_1_3_3_1_6_2","unstructured":"Gheorghe Comanici Eric Bieber Mike Schaekermann et\u00a0al. 2025. Gemini 2.5: Pushing the Frontier with Advanced Reasoning Multimodality Long Context and Next Generation Agentic Capabilities. arxiv:https:\/\/arXiv.org\/abs\/2507.06261\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2507.06261"},{"key":"e_1_3_3_1_7_2","unstructured":"Samuele Cornell Janek Ebbers Constance Douwes Irene Mart\u00edn-Morat\u00f3 Manu Harju Annamaria Mesaros and Romain Serizel. 2024. DCASE 2024 task 4: Sound event detection with heterogeneous data and missing labels. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08056 (2024)."},{"key":"e_1_3_3_1_8_2","unstructured":"Wenqian Cui Xiaoqi Jiao Ziqiao Meng and Irwin King. 2025. VoxEval: Benchmarking the Knowledge Understanding Capabilities of End-to-End Spoken Language Models. arxiv:https:\/\/arXiv.org\/abs\/2501.04962\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2501.04962"},{"key":"e_1_3_3_1_9_2","unstructured":"DeepSeek-AI Aixin Liu Bei Feng et\u00a0al. 2025. DeepSeek-V3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2412.19437\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_3_1_10_2","unstructured":"Soham Deshmukh Benjamin Elizalde and Huaming Wang. 2022. Audio Retrieval with WavText5K and CLAP Training. arxiv:https:\/\/arXiv.org\/abs\/2209.14275\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2209.14275"},{"key":"e_1_3_3_1_11_2","unstructured":"Konstantinos Drossos Samuel Lipping and Tuomas Virtanen. 2019. Clotho: An Audio Captioning Dataset. arxiv:https:\/\/arXiv.org\/abs\/1910.09387\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/1910.09387"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Eduardo Fonseca Xavier Favory Jordi Pons Frederic Font and Xavier Serra. 2022. FSD50K: An Open Dataset of Human-Labeled Sound Events. arxiv:https:\/\/arXiv.org\/abs\/2010.00475\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/2010.00475","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_3_1_14_2","unstructured":"Tiantian Geng Jinrui Zhang Qingni Wang Teng Wang Jinming Duan and Feng Zheng. 2025. LongVALE: Vision-Audio-Language-Event Benchmark Towards Time-Aware Omni-Modal Perception of Long Videos. arxiv:https:\/\/arXiv.org\/abs\/2411.19772\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2411.19772"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Mandeep Goyal and Qusay\u00a0H Mahmoud. 2024. A systematic review of synthetic data generation techniques using generative AI. Electronics 13 17 (2024) 3509.","DOI":"10.3390\/electronics13173509"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Yuto Harada Yusuke Yamauchi Yusuke Oda Yohei Oseki Yusuke Miyao and Yu Takagi. 2025. Massive Supervised Fine-tuning Experiments Reveal How Data Layer and Training Factors Shape LLM Alignment Quality. arxiv:https:\/\/arXiv.org\/abs\/2506.14681\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2506.14681","DOI":"10.18653\/v1\/2025.emnlp-main.1138"},{"key":"e_1_3_3_1_17_2","unstructured":"Shawn Hershey Daniel P.\u00a0W. Ellis Eduardo Fonseca Aren Jansen Caroline Liu R.\u00a0Channing Moore and Manoj Plakal. 2021. The Benefit Of Temporally-Strong Labels In Audio Event Classification. CoRR abs\/2105.07031 (2021). arXiv:https:\/\/arXiv.org\/abs\/2105.07031https:\/\/arxiv.org\/abs\/2105.07031"},{"key":"e_1_3_3_1_18_2","unstructured":"Edward\u00a0J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2106.09685\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2106.09685"},{"key":"e_1_3_3_1_19_2","volume-title":"NAACL-HLT","author":"Kim Chris\u00a0Dongjoo","year":"2019","unstructured":"Chris\u00a0Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. AudioCaps: Generating Captions for Audios in The Wild. In NAACL-HLT."},{"key":"e_1_3_3_1_20_2","unstructured":"KimiTeam Ding Ding Zeqian Ju Yichong Leng Songxiang Liu Tong Liu Zeyu Shang Kai Shen Wei Song Xu Tan Heyi Tang Zhengtao Wang Chu Wei Yifei Xin Xinran Xu Jianwei Yu Yutao Zhang Xinyu Zhou Y. Charles Jun Chen Yanru Chen Yulun Du Weiran He Zhenxing Hu Guokun Lai Qingcheng Li Yangyang Liu Weidong Sun Jianzhou Wang Yuzhi Wang Yuefeng Wu Yuxin Wu Dongchao Yang Hao Yang Ying Yang Zhilin Yang Aoxiong Yin Ruibin Yuan Yutong Zhang and Zaida Zhou. 2025. Kimi-Audio Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2504.18425\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2504.18425"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","unstructured":"A.\u00a0Sophia Koepke Andreea-Maria Oncescu Jo\u00e3o\u00a0F. Henriques Zeynep Akata and Samuel Albanie. 2023. Audio Retrieval With Natural Language Queries: A Benchmark Study. IEEE Transactions on Multimedia 25 (2023) 2675\u20132685. 10.1109\/tmm.2022.3149712","DOI":"10.1109\/tmm.2022.3149712"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","unstructured":"Kaung\u00a0Myat Kyaw and Jonathan\u00a0Hoyin Chan. 2025. A Framework for Synthetic Audio Conversations Generation using Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2409.00946\u00a0[cs.SD] 10.1109\/WI-IAT62293.2024.00056","DOI":"10.1109\/WI-IAT62293.2024.00056"},{"key":"e_1_3_3_1_23_2","unstructured":"Songtao Li and Hao Tang. 2024. Multimodal Alignment and Fusion: A Survey. arxiv:https:\/\/arXiv.org\/abs\/2411.17040\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2411.17040"},{"key":"e_1_3_3_1_24_2","unstructured":"Huadai Liu Jialei Wang Kaicheng Luo Wen Wang Qian Chen Zhou Zhao and Wei Xue. 2025. ThinkSound: Chain-of-Thought Reasoning in Multimodal Large Language Models for Audio Generation and Editing. arxiv:https:\/\/arXiv.org\/abs\/2506.21448\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2506.21448"},{"key":"e_1_3_3_1_25_2","unstructured":"Ziyang Ma Zhuo Chen Yuping Wang Eng\u00a0Siong Chng and Xie Chen. 2025. Audio-CoT: Exploring Chain-of-Thought Reasoning in Large Audio Language Model. arxiv:https:\/\/arXiv.org\/abs\/2501.07246\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/2501.07246"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","unstructured":"Xinhao Mei Chutong Meng Haohe Liu Qiuqiang Kong Tom Ko Chengqi Zhao Mark\u00a0D. Plumbley Yuexian Zou and Wenwu Wang. 2024. WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research. arxiv:https:\/\/arXiv.org\/abs\/2303.17395\u00a0[eess.AS] 10.1109\/TASLP.2024.3419446","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2016.7760424"},{"key":"e_1_3_3_1_28_2","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat Red Avila Igor Babuschkin Suchir Balaji Valerie Balcom Paul Baltescu Haiming Bao Mohammad Bavarian Jeff Belgum Irwan Bello Jake Berdine Gabriel Bernadett-Shapiro Christopher Berner Lenny Bogdonoff Oleg Boiko Madelaine Boyd Anna-Luisa Brakman Greg Brockman Tim Brooks Miles Brundage Kevin Button Trevor Cai Rosie Campbell Andrew Cann Brittany Carey Chelsea Carlson Rory Carmichael Brooke Chan Che Chang Fotis Chantzis Derek Chen Sully Chen Ruby Chen Jason Chen Mark Chen Ben Chess Chester Cho et\u00a0al. 2024. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_1_29_2","unstructured":"Soujanya Poria Devamanyu Hazarika Navonil Majumder Gautam Naik Erik Cambria and Rada Mihalcea. 2019. MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations. arxiv:https:\/\/arXiv.org\/abs\/1810.02508\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1810.02508"},{"key":"e_1_3_3_1_30_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Tao Xu Greg Brockman Christine McLeavey and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arxiv:https:\/\/arXiv.org\/abs\/2212.04356\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2212.04356"},{"key":"e_1_3_3_1_31_2","unstructured":"Rafael Rafailov Archit Sharma Eric Mitchell Stefano Ermon Christopher\u00a0D. Manning and Chelsea Finn. 2024. Direct Preference Optimization: Your Language Model is Secretly a Reward Model. arxiv:https:\/\/arXiv.org\/abs\/2305.18290\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2305.18290"},{"key":"e_1_3_3_1_32_2","unstructured":"S Sakshi Utkarsh Tyagi Sonal Kumar Ashish Seth Ramaneswaran Selvakumar Oriol Nieto Ramani Duraiswami Sreyan Ghosh and Dinesh Manocha. 2024. MMAU: A Massive Multi-Task Audio Understanding and Reasoning Benchmark. arxiv:https:\/\/arXiv.org\/abs\/2410.19168\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2410.19168"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_3_1_34_2","unstructured":"Arvind\u00a0Krishna Sridhar Yinyi Guo and Erik Visser. 2024. Enhancing Temporal Understanding in Audio Question Answering for Large Audio Language Models. arxiv:https:\/\/arXiv.org\/abs\/2409.06223\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/2409.06223"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-industry.78"},{"key":"e_1_3_3_1_36_2","unstructured":"Gemma Team Aishwarya Kamath Johan Ferret Shreya Pathak Nino Vieillard Ramona Merhej Sarah Perrin Tatiana Matejovicova Alexandre Ram\u00e9 Morgane Rivi\u00e8re Louis Rouillard Thomas Mesnard Geoffrey Cideron Jean bastien Grill Sabela Ramos Edouard Yvinec Michelle Casbon Etienne Pot Ivo Penchev Ga\u00ebl Liu Francesco Visin Kathleen Kenealy Lucas Beyer Xiaohai Zhai Anton Tsitsulin Robert Busa-Fekete Alex Feng Noveen Sachdeva Benjamin Coleman Yi Gao Basil Mustafa Iain Barr Emilio Parisotto David Tian Matan Eyal Colin Cherry Jan-Thorsten Peter Danila Sinopalnikov Surya Bhupatiraju Rishabh Agarwal Mehran Kazemi Dan Malkin Ravin Kumar David Vilar Idan Brusilovsky Jiaming Luo Andreas Steiner Abe Friesen Abhanshu Sharma et\u00a0al. 2025. Gemma 3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2503.19786\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2503.19786"},{"key":"e_1_3_3_1_37_2","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Brian Ichter Fei Xia Ed Chi Quoc Le and Denny Zhou. 2023. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2201.11903\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2201.11903"},{"key":"e_1_3_3_1_38_2","unstructured":"Wenyi Xiao Zechuan Wang Leilei Gan Shuai Zhao Zongrui Li Ruirui Lei Wanggui He Luu\u00a0Anh Tuan Long Chen Hao Jiang Zhou Zhao and Fei Wu. 2025. A Comprehensive Survey of Direct Preference Optimization: Datasets Theories Variants and Applications. arxiv:https:\/\/arXiv.org\/abs\/2410.15595\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2410.15595"},{"key":"e_1_3_3_1_39_2","unstructured":"Zhifei Xie Mingbao Lin Zihang Liu Pengcheng Wu Shuicheng Yan and Chunyan Miao. 2025. Audio-Reasoner: Improving Reasoning Capability in Large Audio Language Models. arxiv:https:\/\/arXiv.org\/abs\/2503.02318\u00a0[cs.SD] https:\/\/arxiv.org\/abs\/2503.02318"},{"key":"e_1_3_3_1_40_2","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen Jialin Wang Yang Fan Kai Dang Bin Zhang Xiong Wang Yunfei Chu and Junyang Lin. 2025. Qwen2.5-Omni Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2503.20215\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2503.20215"},{"key":"e_1_3_3_1_41_2","unstructured":"Zhuosheng Zhang Aston Zhang Mu Li Hai Zhao George Karypis and Alex Smola. 2024. Multimodal Chain-of-Thought Reasoning in Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.00923\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.00923"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:39:22Z","timestamp":1781537962000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810622"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":40,"alternative-id":["10.1145\/3805622.3810622","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810622","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}