{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:40Z","timestamp":1781539000210,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"the National Science and Technology Innovation 2030 Major Project","award":["2025ZD1502104"],"award-info":[{"award-number":["2025ZD1502104"]}]},{"name":"the Anhui Province Science and Technology Key Project","award":["202423l10050033"],"award-info":[{"award-number":["202423l10050033"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810802","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"12-20","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Mitigating Semantic Bias in Multilingual Visual Document Retrieval via Language-Vision-Aware Late Interaction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5588-8015","authenticated-orcid":false,"given":"Haowei","family":"Li","sequence":"first","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1848-4245","authenticated-orcid":false,"given":"Haojie","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5129-3945","authenticated-orcid":false,"given":"Jie","family":"Bao","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2151-7028","authenticated-orcid":false,"given":"Zhen","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6403-0557","authenticated-orcid":false,"given":"Yong","family":"Liao","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"vdr-multilingual-test benchmark for visual document retrieval.","year":"2026","unstructured":"2026. vdr-multilingual-test benchmark for visual document retrieval.Retrieved Feb 1, 2026 from https:\/\/huggingface.co\/datasets\/llamaindex\/vdr-multilingual-test"},{"key":"e_1_3_3_1_3_2","volume-title":"VDSID-French: Vision Retrieval Dataset on French documents.","year":"2026","unstructured":"2026. VDSID-French: Vision Retrieval Dataset on French documents.Retrieved Feb 1, 2026 from https:\/\/huggingface.co\/datasets\/vidore\/vdsid_french"},{"key":"e_1_3_3_1_4_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_5_2","unstructured":"Lucas Beyer Andreas Steiner Andr\u00e9\u00a0Susano Pinto Alexander Kolesnikov Xiao Wang Daniel Salz Maxim Neumann Ibrahim Alabdulmohsin Michael Tschannen Emanuele Bugliarello et\u00a0al. 2024. Paligemma: A versatile 3b vlm for transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.07726 (2024)."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.728"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Jianlv Chen Shitao Xiao Peitian Zhang Kun Luo Defu Lian and Zheng Liu. 2024. BGE M3-Embedding: Multi-Lingual Multi-Functionality Multi-Granularity Text Embeddings Through Self-Knowledge Distillation. arxiv:https:\/\/arXiv.org\/abs\/2402.03216\u00a0[cs.CL]","DOI":"10.18653\/v1\/2024.findings-acl.137"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.375"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"e_1_3_3_1_10_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_3_1_12_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Faysse Manuel","unstructured":"Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, CELINE HUDELOT, and Pierre Colombo. [n. d.]. ColPali: Efficient Document Retrieval with Vision Language Models. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_13_2","unstructured":"Ankit Garg Kirankumar Shiragur Neeraj Kayal et\u00a0al. 2025. Incorporating Token Importance in Multi-Vector Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.16106 (2025)."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.mrl-main.36"},{"key":"e_1_3_3_1_15_2","volume-title":"International Conference on Learning Representations","author":"Hu Edward\u00a0J","unstructured":"Edward\u00a0J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et\u00a0al. [n. d.]. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.293"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.854"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401075"},{"key":"e_1_3_3_1_19_2","unstructured":"Diederik\u00a0P Kingma. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_1_20_2","unstructured":"Adithya\u00a0S Kolavi and Vyoman Jain. 2025. M3DR: Towards Universal Multilingual Multimodal Document Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2512.03514 (2025)."},{"key":"e_1_3_3_1_21_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in neural information processing systems 33 (2020) 9459\u20139474."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26538"},{"key":"e_1_3_3_1_23_2","unstructured":"Zehan Li Xin Zhang Yanzhao Zhang Dingkun Long Pengjun Xie and Meishan Zhang. 2023. Towards general text embeddings with multi-stage contrastive learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.03281 (2023)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.943"},{"key":"e_1_3_3_1_25_2","unstructured":"Ant\u00f3nio Loison Quentin Mac\u00e9 Antoine Edy Victor Xing Tom Balough Gabriel Moreira Bo Liu Manuel Faysse C\u00e9line Hudelot and Gautier Viaud. 2026. ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2601.08620 (2026)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Canjie Luo Lianwen Jin and Zenghui Sun. 2019. Moran: A multi-object rectified attention network for scene text recognition. Pattern Recognition 90 (2019) 109\u2013118.","DOI":"10.1016\/j.patcog.2019.01.020"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.373"},{"key":"e_1_3_3_1_28_2","unstructured":"Quentin Mac\u00e9 Ant\u00f3nio Loison and Manuel Faysse. 2025. ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.17166 (2025)."},{"key":"e_1_3_3_1_29_2","first-page":"2071","volume-title":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","author":"Masry Ahmed","year":"2025","unstructured":"Ahmed Masry, Megh Thakkar, Patrice Bechard, Sathwik\u00a0Tejaswi Madhusudhan, Rabiul Awal, Shambhavi Mishra, Akshay\u00a0Kalkunte Suresh, Srivatsava Daruru, Enamul Hoque, Spandana Gella, et\u00a0al. 2025. ColMate: Contrastive late interaction and masked text for multimodal document retrieval. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track. 2071\u20132080."},{"key":"e_1_3_3_1_30_2","unstructured":"Elio Musacchio Lucia Siciliani Pierpaolo Basile and Giovanni Semeraro. 2025. xVLM2Vec: Adapting LVLM-based embedding models to multilinguality using Self-Knowledge Distillation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.09313 (2025)."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Humza Naveed Asad\u00a0Ullah Khan Shi Qiu Muhammad Saqib Saeed Anwar Muhammad Usman Naveed Akhtar Nick Barnes and Ajmal Mian. 2025. A comprehensive overview of large language models. ACM Transactions on Intelligent Systems and Technology 16 5 (2025) 1\u201372.","DOI":"10.1145\/3744746"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3726302.3730285"},{"key":"e_1_3_3_1_33_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Baoguang Shi Xiang Bai and Cong Yao. 2016. An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence 39 11 (2016) 2298\u20132304.","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_3_1_35_2","unstructured":"David Wan Han Wang Elias Stengel-Eskin Jaemin Cho and Mohit Bansal. 2025. CLaMR: Contextualized Late-Interaction for Multimodal Content Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.06144 (2025)."},{"key":"e_1_3_3_1_36_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Yu Shi","unstructured":"Shi Yu, Chaoyue Tang, Bokai Xu, Junbo Cui, Junhao Ran, Yukun Yan, Zhenghao Liu, Shuo Wang, Xu Han, Zhiyuan Liu, et\u00a0al. [n. d.]. VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents. In The Thirteenth International Conference on Learning Representations."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:31:46Z","timestamp":1781537506000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810802"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":35,"alternative-id":["10.1145\/3805622.3810802","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810802","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}