{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:11Z","timestamp":1781538971918,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810818","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"167-176","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RoATR: A Systematic Study of Audio-Text Retrieval Robustness Against Realistic Perturbations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1711-3018","authenticated-orcid":false,"given":"Honglei","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Software, Nanjing University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9028-674X","authenticated-orcid":false,"given":"Pengfei","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1957-3677","authenticated-orcid":false,"given":"Ruohan","family":"Wang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, School of Software, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9406-6745","authenticated-orcid":false,"given":"Siyue","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7386-0026","authenticated-orcid":false,"given":"Yilei","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"John Alderete Macarious Kin\u00a0Fung Hui and Aanchan Mohan. 2025. Evaluating ASR robustness to spontaneous speech errors: A study of WhisperX using a Speech Error Database. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.13060 (2025).","DOI":"10.21437\/Interspeech.2025-2164"},{"key":"e_1_3_3_2_3_2","first-page":"4218","volume-title":"Proceedings of the twelfth language resources and evaluation conference","author":"Ardila Rosana","year":"2020","unstructured":"Rosana Ardila, Megan Branson, Kelly Davis, Michael Kohler, Josh Meyer, Michael Henretty, Reuben Morais, Lindsay Saunders, Francis Tyers, and Gregor Weber. 2020. Common voice: A massively-multilingual speech corpus. In Proceedings of the twelfth language resources and evaluation conference. 4218\u20134222."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-64680-0_14"},{"key":"e_1_3_3_2_5_2","unstructured":"Parishad BehnamGhader Vaibhav Adlakha Marius Mosbach Dzmitry Bahdanau Nicolas Chapados and Siva Reddy. [n. d.]. Llm2vec: Large language models are secretly powerful text encoders 2024. URL https:\/\/arxiv. org\/abs\/2404.05961 ([n. d.])."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Carlos Busso Murtaza Bulut Chi-Chun Lee Abe Kazemzadeh Emily Mower Samuel Kim Jeannette\u00a0N Chang Sungbok Lee and Shrikanth\u00a0S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation 42 4 (2008) 335\u2013359.","DOI":"10.1007\/s10579-008-9076-6"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747755"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.137"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao et\u00a0al. 2022. Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE Journal of Selected Topics in Signal Processing 16 6 (2022) 1505\u20131518.","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.613"},{"key":"e_1_3_3_2_11_2","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin et\u00a0al. 2024. Qwen2-audio technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.10759 (2024)."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_3_2_13_2","unstructured":"Zhihao Du Changfeng Gao Yuxuan Wang Fan Yu Tianyu Zhao Hao Wang Xiang Lv Hui Wang Chongjia Ni Xian Shi et\u00a0al. 2025. Cosyvoice 3: Towards in-the-wild speech generation via scaling-up and post-training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.17589 (2025)."},{"key":"e_1_3_3_2_14_2","unstructured":"Google Research. 2025. Simple Voice Questions (SVQ) Dataset. https:\/\/huggingface.co\/datasets\/google\/svq. Accessed: Feb. 1 2026."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-2605"},{"key":"e_1_3_3_2_17_2","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track","author":"Heigold Georg","year":"2025","unstructured":"Georg Heigold, Ehsan Variani, Tom Bagby, Cyril Allauzen, Ji Ma, Shankar Kumar, and Michael Riley. 2025. Massive Sound Embedding Benchmark (MSEB). In The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=X0juYgFVng"},{"key":"e_1_3_3_2_18_2","unstructured":"Wei-Ning Hsu Anuroop Sriram Alexei Baevski Tatiana Likhomanenko Qiantong Xu Vineel Pratap Jacob Kahn Ann Lee Ronan Collobert Gabriel Synnaeve et\u00a0al. 2021. Robust wav2vec 2.0: Analyzing domain shift in self-supervised pre-training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.01027 (2021)."},{"key":"e_1_3_3_2_19_2","unstructured":"Ting Jiang Minghui Song Zihan Zhang Haizhen Huang Weiwei Deng Feng Sun Qi Zhang Deqing Wang and Fuzhen Zhuang. 2024. E5-v: Universal embeddings with multimodal large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.12580 (2024)."},{"key":"e_1_3_3_2_20_2","unstructured":"Ziyan Jiang Rui Meng Xinyi Yang Semih Yavuz Yingbo Zhou and Wenhu Chen. [n. d.]. Vlm2vec: Training vision-language models for massive multimodal embedding tasks 2024. URL https:\/\/arxiv. org\/abs\/2410.05160 ([n. d.])."},{"key":"e_1_3_3_2_21_2","first-page":"119","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Kim Chris\u00a0Dongjoo","year":"2019","unstructured":"Chris\u00a0Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 119\u2013132."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2013.6701894"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Tom Kwiatkowski Jennimaria Palomaki Olivia Redfield Michael Collins Ankur Parikh Chris Alberti Danielle Epstein Illia Polosukhin Jacob Devlin Kenton Lee et\u00a0al. 2019. Natural questions: a benchmark for question answering research. Transactions of the Association for Computational Linguistics 7 (2019) 453\u2013466.","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_3_2_24_2","volume-title":"Interspeech","author":"Lee Chia-Hsuan","year":"2018","unstructured":"Chia-Hsuan Lee, Szu-Lin Wu, Chi-Liang Liu, and Hung yi Lee. 2018. Spoken SQuAD: A Study of Mitigating the Impact of Speech Recognition Errors on Listening Comprehension. In Interspeech. https:\/\/api.semanticscholar.org\/CorpusID:4561735"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Lin-shan Lee James Glass Hung-yi Lee and Chun-an Chan. 2015. Spoken Content Retrieval\u2014Beyond Cascading Speech Recognition with Text Retrieval. IEEE\/ACM Transactions on Audio Speech and Language Processing 23 9 (2015) 1389\u20131420. 10.1109\/TASLP.2015.2438543","DOI":"10.1109\/TASLP.2015.2438543"},{"key":"e_1_3_3_2_26_2","unstructured":"Tatiana Likhomanenko Qiantong Xu Vineel Pratap Paden Tomasello Jacob Kahn Gilad Avidov Ronan Collobert and Gabriel Synnaeve. 2020. Rethinking evaluation in ASR: Are our models robust enough? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11745 (2020)."},{"key":"e_1_3_3_2_27_2","unstructured":"Sarthak\u00a0Kumar Maharana Saksham\u00a0Singh Kushwaha Baoming Zhang Adrian Rodriguez Songtao Wei Yapeng Tian and Yunhui Guo. 2025. AVROBUSTBENCH: Benchmarking the Robustness of Audio-Visual Recognition Models at Test-Time. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.00358 (2025)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Xinhao Mei Chutong Meng Haohe Liu Qiuqiang Kong Tom Ko Chengqi Zhao Mark\u00a0D Plumbley Yuexian Zou and Wenwu Wang. 2024. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. IEEE\/ACM Transactions on Audio Speech and Language Processing 32 (2024) 3339\u20133354.","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"e_1_3_3_2_29_2","unstructured":"Tri Nguyen Mir Rosenberg Xia Song Jianfeng Gao Saurabh Tiwary Rangan Majumder and Li Deng. 2016. Ms marco: A human-generated machine reading comprehension dataset. (2016)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_2_31_2","first-page":"553","volume-title":"2004 12th European Signal Processing Conference","author":"Parihar Naveen","year":"2004","unstructured":"Naveen Parihar, Joseph Picone, David Pearce, and Hans-G\u00fcnter Hirsch. 2004. Performance analysis of the Aurora large vocabulary baseline system. In 2004 12th European Signal Processing Conference. IEEE, 553\u2013556."},{"key":"e_1_3_3_2_32_2","unstructured":"David Pearce and J Picone. 2002. Aurora working group: DSR front end LVCSR evaluation AU\/384\/02. Inst. for Signal & Inform. Process. Mississippi State Univ. Tech. Rep (2002)."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_3_2_34_2","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR."},{"key":"e_1_3_3_2_35_2","first-page":"28492","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492\u201328518."},{"key":"e_1_3_3_2_36_2","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019). https:\/\/d4mucfpksywv.cloudfront.net\/better-language-models\/language-models.pdf"},{"key":"e_1_3_3_2_37_2","first-page":"38625","volume-title":"International Conference on Learning Representations","author":"Shah Muhammad","year":"2025","unstructured":"Muhammad Shah, David Solans\u00a0Noguero, Mikko Heikkil\u00e4, Bhiksha Raj, and Nicolas Kourtellis. 2025. Speech Robust Bench: A Robustness Benchmark For Speech Recognition. In International Conference on Learning Representations. 38625\u201338651."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413386"},{"key":"e_1_3_3_2_39_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Springer Jacob\u00a0Mitchell","year":"2025","unstructured":"Jacob\u00a0Mitchell Springer, Suhas Kotha, Daniel Fried, Graham Neubig, and Aditi Raghunathan. 2025. Repetition Improves Language Model Embeddings. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/pdf?id=Ahlrf2HGJR"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3758260"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746213"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"James Thorne Andreas Vlachos Christos Christodoulopoulos and Arpit Mittal. 2018. FEVER: a large-scale dataset for fact extraction and VERification. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1803.05355 (2018).","DOI":"10.18653\/v1\/N18-1074"},{"key":"e_1_3_3_2_43_2","unstructured":"Ke Wang Houxing Ren Zimu Lu Mingjie Zhan and Hongsheng Li. 2025. VoiceAssistant-Eval: Benchmarking AI Assistants across Listening Speaking and Viewing. https:\/\/arxiv.org\/abs\/2509.22651"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Shinji Watanabe Michael Mandel Jon Barker Emmanuel Vincent Ashish Arora Xuankai Chang Sanjeev Khudanpur Vimal Manohar Daniel Povey Desh Raj et\u00a0al. 2020. CHiME-6 challenge: Tackling multispeaker speech recognition for unsegmented recordings. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2004.09249 (2020).","DOI":"10.21437\/CHiME.2020-1"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/1458082.1458158"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.597"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Gordon Wichern Joe Antognini Michael Flynn Licheng\u00a0Richard Zhu Emmett McQuinn Dwight Crow Ethan Manilow and Jonathan\u00a0Le Roux. 2019. Wham!: Extending speech separation to noisy environments. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.01160 (2019).","DOI":"10.21437\/Interspeech.2019-2821"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_3_2_49_2","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP","author":"Wu* Yusong","year":"2023","unstructured":"Yusong Wu*, Ke Chen*, Tianyu Zhang*, Yuchen Hui*, Taylor Berg-Kirkpatrick, and Shlomo Dubnov. 2023. Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP. https:\/\/arxiv.org\/abs\/2211.06687"},{"key":"e_1_3_3_2_50_2","unstructured":"Chenghao Xiao Hou\u00a0Pong Chan Hao Zhang Weiwen Xu Mahani Aljunied and Yu Rong. 2025. Scaling Language-Centric Omnimodal Representation Learning. arxiv:https:\/\/arXiv.org\/abs\/2510.11693\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2510.11693"},{"key":"e_1_3_3_2_51_2","unstructured":"Mengyao Xu Wenfei Zhou Yauhen Babakhin Gabriel Moreira Ronay Ak Radek Osmulski Bo Liu Even Oldridge and Benedikt Schifferer. 2025. Omni-Embed-Nemotron: A Unified Multimodal Retrieval Model for Text Image Audio and Video. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.03458 (2025)."},{"key":"e_1_3_3_2_52_2","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et\u00a0al. 2025. Qwen3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09388 (2025)."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.213"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Xinyu Zhang Nandan Thakur Odunayo Ogundepo Ehsan Kamalloo David Alfonso-Hermelo Xiaoguang Li Qun Liu Mehdi Rezagholizadeh and Jimmy Lin. 2023. Miracl: A multilingual retrieval dataset covering 18 diverse languages. Transactions of the Association for Computational Linguistics 11 (2023) 1114\u20131131.","DOI":"10.1162\/tacl_a_00595"},{"key":"e_1_3_3_2_56_2","unstructured":"Xin Zhang Yanzhao Zhang Wen Xie Mingxin Li Ziqi Dai Dingkun Long Pengjun Xie Meishan Zhang Wenjie Li and Min Zhang. 2024. GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.16855 (2024)."},{"key":"e_1_3_3_2_57_2","unstructured":"Yiman Zhang Ziheng Luo Qiangyu Yan Wei He Borui Jiang Xinghao Chen and Kai Han. 2025. OmniEval: A Benchmark for Evaluating Omni-modal Models with Visual Auditory and Textual Inputs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.20960 (2025)."},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413391"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:28:54Z","timestamp":1781537334000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810818"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":57,"alternative-id":["10.1145\/3805622.3810818","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810818","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}