{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T11:48:52Z","timestamp":1774352932721,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"China Scholarship Council","doi-asserted-by":"publisher","award":["202208410053"],"award-info":[{"award-number":["202208410053"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"NWO Talent Programme","award":["VI.Vidi.223.166"],"award-info":[{"award-number":["VI.Vidi.223.166"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730285","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:21:38Z","timestamp":1752456098000},"page":"3335-3345","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Reproducibility, Replicability, and Insights into Visual Document Retrieval with Late Interaction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4474-6213","authenticated-orcid":false,"given":"Jingfen","family":"Qiao","sequence":"first","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2247-3370","authenticated-orcid":false,"given":"Jia-Huei","family":"Ju","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5511-9370","authenticated-orcid":false,"given":"Xinyu","family":"Ma","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8312-0694","authenticated-orcid":false,"given":"Evangelos","family":"Kanoulas","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5970-880X","authenticated-orcid":false,"given":"Andrew","family":"Yates","sequence":"additional","affiliation":[{"name":"HLTCOE, Johns Hopkins University, Baltimore, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=en4LGxpd9E","author":"Alabdulmohsin Ibrahim","year":"2023","unstructured":"Ibrahim Alabdulmohsin, Xiaohua Zhai, Alexander Kolesnikov, and Lucas Beyer. 2023. Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=en4LGxpd9E"},{"key":"e_1_3_2_1_2_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Alexander Kolesnikov, Xiao Wang, Daniel Salz, Maxim Neumann, Ibrahim Alabdulmohsin, Michael Tschannen, Emanuele Bugliarello, et al.","author":"Beyer Lucas","year":"2024","unstructured":"Lucas Beyer, Andreas Steiner, Andr\u00e9 Susano Pinto, Alexander Kolesnikov, Xiao Wang, Daniel Salz, Maxim Neumann, Ibrahim Alabdulmohsin, Michael Tschannen, Emanuele Bugliarello, et al., 2024. Paligemma: A versatile 3b vlm for transfer. arXiv preprint arXiv:2407.07726 (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877-1901."},{"key":"e_1_3_2_1_5_1","volume-title":"Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216","author":"Chen Jianlv","year":"2024","unstructured":"Jianlv Chen, Shitao Xiao, Peitian Zhang, Kun Luo, Defu Lian, and Zheng Liu. 2024. Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"PaLI: A Jointly-Scaled Multilingual Language-Image Model. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mWVoBz4W0u","author":"Chen Xi","year":"2023","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, Alexander Kolesnikov, Joan Puigcerver, Nan Ding, Keran Rong, Hassan Akbari, Gaurav Mishra, Linting Xue, Ashish V Thapliyal, James Bradbury, Weicheng Kuo, Mojtaba Seyedhosseini, Chao Jia, Burcu Karagol Ayan, Carlos Riquelme Ruiz, Andreas Peter Steiner, Anelia Angelova, Xiaohua Zhai, Neil Houlsby, and Radu Soricut. 2023. PaLI: A Jointly-Scaled Multilingual Language-Image Model. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mWVoBz4W0u"},{"key":"e_1_3_2_1_7_1","unstructured":"Jaemin Cho Debanjan Mahata Ozan Irsoy Yujie He and Mohit Bansal. 2024. M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page Multi-document Understanding. arxiv:2411.04952 [cs.CV] https:\/\/arxiv.org\/abs\/2411.04952"},{"key":"e_1_3_2_1_8_1","volume-title":"PDFVQA: A New Dataset for Real-World VQA on PDF Documents. arxiv:2304.06447 [cs.CV] https:\/\/arxiv.org\/abs\/2304.06447","author":"Ding Yihao","year":"2023","unstructured":"Yihao Ding, Siwen Luo, Hyunsuk Chung, and Soyeon Caren Han. 2023. PDFVQA: A New Dataset for Real-World VQA on PDF Documents. arxiv:2304.06447 [cs.CV] https:\/\/arxiv.org\/abs\/2304.06447"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_10_1","volume-title":"ColPali: Efficient Document Retrieval with Vision Language Models. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ogjBpZ8uSi","author":"Faysse Manuel","year":"2025","unstructured":"Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, CELINE HUDELOT, and Pierre Colombo. 2025. ColPali: Efficient Document Retrieval with Vision Language Models. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ogjBpZ8uSi"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.mrl-1.11"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401075"},{"key":"e_1_3_2_1_14_1","first-page":"1487","volume-title":"Probing Compositionality in Large Image Models. In Findings of the Association for Computational Linguistics: EACL 2024","author":"Lewis Martha","year":"2024","unstructured":"Martha Lewis, Nihal Nayak, Peilin Yu, Jack Merullo, Qinan Yu, Stephen Bach, and Ellie Pavlick. 2024. Does CLIP Bind Concepts? Probing Compositionality in Large Image Models. In Findings of the Association for Computational Linguistics: EACL 2024, Yvette Graham and Matthew Purver (Eds.). Association for Computational Linguistics, St. Julian's, Malta, 1487-1500. https:\/\/aclanthology.org\/2024.findings-eacl.101\/"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 162), Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.). PMLR, 12888-12900. https:\/\/proceedings.mlr.press\/v162\/li22n.html"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.775"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3611380.3628554"},{"key":"e_1_3_2_1_18_1","unstructured":"Zichao Li Aizier Abulaiti Yaojie Lu Xuanang Chen Jia Zheng Hongyu Lin Xianpei Han and Le Sun. 2024a. READoc: A Unified Benchmark for Realistic Document Structured Extraction. arxiv:2409.05137 [cs.CL] https:\/\/arxiv.org\/abs\/2409.05137"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.772"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.373"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.373"},{"key":"e_1_3_2_1_22_1","volume-title":"VISA: Retrieval Augmented Generation with Visual Source Attribution. arxiv:2412.14457 [cs.IR] https:\/\/arxiv.org\/abs\/2412.14457","author":"Ma Xueguang","year":"2024","unstructured":"Xueguang Ma, Shengyao Zhuang, Bevan Koopman, Guido Zuccon, Wenhu Chen, and Jimmy Lin. 2024c. VISA: Retrieval Augmented Generation with Visual Source Attribution. arxiv:2412.14457 [cs.IR] https:\/\/arxiv.org\/abs\/2412.14457"},{"key":"e_1_3_2_1_23_1","volume-title":"Infographicvqa. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 1697-1706","author":"Mathew Minesh","year":"2022","unstructured":"Minesh Mathew, Viraj Bagal, Rub\u00e8n Tito, Dimosthenis Karatzas, Ernest Valveny, and CV Jawahar. 2022. Infographicvqa. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 1697-1706."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_2_1_25_1","unstructured":"Microsoft. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arxiv:2404.14219 [cs.CL] https:\/\/arxiv.org\/abs\/2404.14219"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748-8763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.272"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Ryota Tanaka Kyosuke Nishida Kosuke Nishida Taku Hasegawa Itsumi Saito and Kuniko Saito. 2023. SlideVQA: A Dataset for Document Visual Question Answering on Multiple Images. In AAAI.","DOI":"10.1609\/aaai.v37i11.26598"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.463"},{"key":"e_1_3_2_1_31_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024a. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591916"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73021-4_23"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591903"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548422"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548422"},{"key":"e_1_3_2_1_38_1","unstructured":"Shengyao Zhuang Ekaterina Khramtsova Xueguang Ma Bevan Koopman Jimmy Lin and Guido Zuccon. 2025. Document Screenshot Retrievers are Vulnerable to Pixel Poisoning Attacks. arxiv:2501.16902 [cs.IR] https:\/\/arxiv.org\/abs\/2501.16902"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730285","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:02:24Z","timestamp":1755856944000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730285"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":38,"alternative-id":["10.1145\/3726302.3730285","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730285","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}