{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:10Z","timestamp":1781539030100,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62472291"],"award-info":[{"award-number":["62472291"]}]},{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2025A1515012154"],"award-info":[{"award-number":["2025A1515012154"]}]},{"name":"Shenzhen Science and Technology Program","award":["JCYJ20250604181605008"],"award-info":[{"award-number":["JCYJ20250604181605008"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810703","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1308-1316","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SliceCSRef: Dual-Level Semantic Alignment for Robust Speech Referring Expression Comprehension"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5656-5476","authenticated-orcid":false,"given":"Lihong","family":"Huang","sequence":"first","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen university, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7524-5999","authenticated-orcid":false,"given":"Sheng-Hua","family":"Zhong","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0412-0416","authenticated-orcid":false,"given":"Qiao","family":"Yan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9664-821X","authenticated-orcid":false,"given":"Zhijiao","family":"Xiao","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4242-4840","authenticated-orcid":false,"given":"Yan","family":"Liu","sequence":"additional","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Alexei Baevski Yuhao Zhou Abdelrahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020) 12449\u201312460."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao et\u00a0al. 2022. Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE Journal of Selected Topics in Signal Processing 16 6 (2022) 1505\u20131518.","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_3_1_5_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). Association for Computational Linguistics, Minneapolis, MN, USA, 4171\u20134186."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448504"},{"key":"e_1_3_3_1_7_2","unstructured":"Zheng Ge Songtao Liu Feng Wang Zeming Li and Jian Sun. 2021. Yolox: Exceeding yolo series in 2021. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.08430."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Robert Geirhos J\u00f6rn-Henrik Jacobsen Claudio Michaelis Richard Zemel Wieland Brendel Matthias Bethge and Felix\u00a0A Wichmann. 2020. Shortcut learning in deep neural networks. Nature Machine Intelligence 2 11 (2020) 665\u2013673.","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Spatial pyramid pooling in deep convolutional networks for visual recognition. IEEE transactions on pattern analysis and machine intelligence 37 9 (2015) 1904\u20131916.","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio speech and language processing 29 (2021) 3451\u20133460.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","unstructured":"Lihong Huang Sheng-hua Zhong and Yan Liu. 2025. CSRef: Contrastive Semantic Alignment for Speech Referring Expression Comprehension. ACM Transactions on Multimedia Computing Communications and Applications (2025). 10.1145\/3765520","DOI":"10.1145\/3765520"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_3_1_16_2","first-page":"38","volume-title":"European conference on computer vision","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et\u00a0al. 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European conference on computer vision. Springer, Cham, 38\u201355."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Gen Luo Yiyi Zhou Jiamu Sun Xiaoshuai Sun and Rongrong Ji. 2023. A survivor in the era of large-scale pretraining: An empirical study of one-stage referring expression comprehension. IEEE Transactions on Multimedia 26 (2023) 3689\u20133700.","DOI":"10.1109\/TMM.2023.3314153"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_3_1_19_2","unstructured":"Andrew\u00a0L Maas Awni\u00a0Y Hannun and Andrew\u00a0Y Ng. 2013. Rectifier nonlinearities improve neural network acoustic models. ICML Workshop."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3466932"},{"key":"e_1_3_3_1_22_2","unstructured":"Aaron van\u00a0den Oord Yazhe Li and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1807.03748."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Daniel\u00a0S Park William Chan Yu Zhang Chung-Cheng Chiu Barret Zoph Ekin\u00a0D Cubuk and Quoc\u00a0V Le. 2019. Specaugment: A simple data augmentation method for automatic speech recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1904.08779.","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"e_1_3_3_1_25_2","unstructured":"Zhiliang Peng Wenhui Wang Li Dong Yaru Hao Shaohan Huang Shuming Ma and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.14824."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.589"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Yanyuan Qiao Chaorui Deng and Qi Wu. 2020. Referring expression comprehension: A survey of methods and datasets. IEEE Transactions on Multimedia 23 (2020) 4426\u20134440.","DOI":"10.1109\/TMM.2020.3042066"},{"key":"e_1_3_3_1_28_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.5555\/3618408.3619590"},{"key":"e_1_3_3_1_30_2","unstructured":"Kentaro Seki Yuki Okamoto Kouei Yamaoka Yuki Saito Shinnosuke Takamichi and Hiroshi Saruwatari. 2025. Spatial-CLAP: Learning Spatially-Aware audio\u2013text Embeddings for Multi-Source Conditions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.14785."},{"key":"e_1_3_3_1_31_2","unstructured":"Naftali Tishby Fernando\u00a0C Pereira and William Bialek. 2000. The information bottleneck method. arXiv preprint physics\/0004057."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00203"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Linhui Xiao Xiaoshan Yang Xiangyuan Lan Yaowei Wang and Changsheng Xu. 2026. Towards visual grounding: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence 48 3 (2026) 2749\u20132771.","DOI":"10.1109\/TPAMI.2025.3630635"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1174"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Shukang Yin Chaoyou Fu Sirui Zhao Ke Li Xing Sun Tong Xu and Enhong Chen. 2024. A survey on multimodal large language models. National Science Review 11 12 (2024) nwae403.","DOI":"10.1093\/nsr\/nwae403"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP58920.2024.10734763"},{"key":"e_1_3_3_1_40_2","unstructured":"Hongyi Zhang Moustapha Cisse Yann\u00a0N Dauphin and David Lopez-Paz. 2017. mixup: Beyond empirical risk minimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1710.09412."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"crossref","unstructured":"Yiyi Zhou Rongrong Ji Gen Luo Xiaoshuai Sun Jinsong Su Xinghao Ding Chia-Wen Lin and Qi Tian. 2021. A real-time global inference network for one-stage referring expression comprehension. IEEE Transactions on Neural Networks and Learning Systems 34 1 (2021) 134\u2013143.","DOI":"10.1109\/TNNLS.2021.3090426"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:28:26Z","timestamp":1781537306000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810703"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":41,"alternative-id":["10.1145\/3805622.3810703","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810703","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}