{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T11:48:51Z","timestamp":1774352931798,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":83,"publisher":"ACM","funder":[{"name":"Ahold Delhaize","award":["AIRLab Amsterdam"],"award-info":[{"award-number":["AIRLab Amsterdam"]}]},{"name":"Dutch Research Council (NWO)","award":["024.004.022, NWA.1389.20.183, KICH3.LTP.20.006"],"award-info":[{"award-number":["024.004.022, NWA.1389.20.183, KICH3.LTP.20.006"]}]},{"name":"European Union's Horizon Europe program","award":["101070212"],"award-info":[{"award-number":["101070212"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730290","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:21:38Z","timestamp":1752456098000},"page":"3183-3193","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Benchmark Granularity and Model Robustness for Image-Text Retrieval: A Reproducibility Study"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0314-2955","authenticated-orcid":false,"given":"Mariya","family":"Hendriksen","sequence":"first","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3179-4125","authenticated-orcid":false,"given":"Shuo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Bloomberg, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5316-2683","authenticated-orcid":false,"given":"Ridho","family":"Reinanda","sequence":"additional","affiliation":[{"name":"Bloomberg, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1260-8059","authenticated-orcid":false,"given":"Mohamed","family":"Yahya","sequence":"additional","affiliation":[{"name":"Bloomberg, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0516-3688","authenticated-orcid":false,"given":"Edgar","family":"Meij","sequence":"additional","affiliation":[{"name":"Bloomberg, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1086-0202","authenticated-orcid":false,"given":"Maarten","family":"de Rijke","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Controlled Caption Generation for Images Through Adversarial Attacks. CoRR","author":"Aafaq Nayyer","year":"2021","unstructured":"Nayyer Aafaq, Naveed Akhtar, Wei Liu, Mubarak Shah, and Ajmal Mian. 2021. Controlled Caption Generation for Images Through Adversarial Attacks. CoRR, Vol. abs\/2107.03050 (2021)."},{"key":"e_1_3_2_1_2_1","first-page":"2131","article-title":"Fusion of Detected Objects in Text for Visual Question Answering","author":"Alberti Chris","year":"2019","unstructured":"Chris Alberti, Jeffrey Ling, Michael Collins, and David Reitter. 2019. Fusion of Detected Objects in Text for Visual Question Answering. In EMNLP. 2131-2140.","journal-title":"EMNLP."},{"key":"e_1_3_2_1_3_1","unstructured":"Association for Computing Machinery. 2020. Artifact Review and Badging - Version 1.1. https:\/\/www.acm.org\/publications\/policies\/artifact-review-and-badging-current Accessed: 2025-02-16."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.696"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482493"},{"key":"e_1_3_2_1_7_1","first-page":"677","volume-title":"UK","volume":"12354","author":"Brown Andrew","year":"2020","unstructured":"Andrew Brown, Weidi Xie, Vicky Kalogeiton, and Andrew Zisserman. 2020. Smooth-AP: Smoothing the Path Towards Large-Scale Image Retrieval. In Computer Vision - ECCV 2020 - 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part IX, Vol. 12354. Springer, 677-694."},{"key":"e_1_3_2_1_8_1","volume-title":"Image-text Retrieval: A Survey on Recent Research and Development. In Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, IJCAI 2022","author":"Cao Min","year":"2022","unstructured":"Min Cao, Shiping Li, Juntao Li, Liqiang Nie, and Min Zhang. [n.d.]. Image-text Retrieval: A Survey on Recent Research and Development. In Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, IJCAI 2022, Vienna, Austria, 23-29 July 2022, Luc De Raedt (Ed.)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/759"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.821"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1241"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591758"},{"key":"e_1_3_2_1_13_1","volume-title":"Microsoft COCO Captions: Data Collection and Evaluation Server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_14_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. UNITER: Universal Image-Text Representation Learning. In ECCV. 104-120."},{"key":"e_1_3_2_1_15_1","volume-title":"AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities. In Findings of the Association for Computational Linguistics: ACL 2023","author":"Chen Zhongzhi","year":"2023","unstructured":"Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Qinghong Yang, and Ledell Wu. 2023a. AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities. In Findings of the Association for Computational Linguistics: ACL 2023, Toronto, Canada, July 9-14, 2023, Anna Rogers, Jordan L. Boyd-Graber, and Naoaki Okazaki (Eds.). Association for Computational Linguistics, 8666-8682."},{"key":"e_1_3_2_1_16_1","volume-title":"Thomas Dideriksen, Himanshu Arora, Matthieu Guillaumin, and Jitendra Malik.","author":"Collins Jasmine","year":"2022","unstructured":"Jasmine Collins, Shubham Goel, Kenan Deng, Achleshwar Luthra, Leon Xu, Erhan Gundogdu, Xi Zhang, Tomas F Yago Vicente, Thomas Dideriksen, Himanshu Arora, Matthieu Guillaumin, and Jitendra Malik. 2022. ABO: Dataset and Benchmarks for Real-World 3D Object Understanding. CVPR (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In British Machine Vision Conference 2018, BMVC 2018","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J. Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In British Machine Vision Conference 2018, BMVC 2018, Newcastle, UK, September 3-6, 2018. BMVA Press, 12."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3450009"},{"key":"e_1_3_2_1_19_1","volume-title":"Advances in Neural Information Processing Systems","volume":"26","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A Seep Visual-Semantic Embedding Model. Advances in Neural Information Processing Systems, Vol. 26 (2013)."},{"key":"e_1_3_2_1_20_1","volume-title":"Tackling Attribute Fine-grainedness in Cross-modal Fashion Search with Multi-level Features. In SIGIR 2021 Workshop on eCommerce. ACM.","author":"Goei Kenneth","year":"2021","unstructured":"Kenneth Goei, Mariya Hendriksen, and Maarten de Rijke. 2021. Tackling Attribute Fine-grainedness in Cross-modal Fashion Search with Multi-level Features. In SIGIR 2021 Workshop on eCommerce. ACM."},{"key":"e_1_3_2_1_21_1","volume-title":"Improving Image-Sentence Embeddings Using Large Weakly Annotated Photo Collections. In European Conference on Computer Vision. Springer, 529-545","author":"Gong Yunchao","year":"2014","unstructured":"Yunchao Gong, Liwei Wang, Micah Hodosh, Julia Hockenmaier, and Svetlana Lazebnik. 2014. Improving Image-Sentence Embeddings Using Large Weakly Annotated Photo Collections. In European Conference on Computer Vision. Springer, 529-545."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision. 1463-1471","author":"Han Xintong","unstructured":"Xintong Han, Zuxuan Wu, Phoenix X. Huang, Xiao Zhang, Menglong Zhu, Yuan Li, Yang Zhao, and Larry S. Davis. 2017. Automatic Spatially-Aware Fashion Concept Discovery. In Proceedings of the IEEE International Conference on Computer Vision. 1463-1471."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-2649-2"},{"key":"e_1_3_2_1_24_1","volume-title":"Extending CLIP for Category-to-image Retrieval in E-commerce. In ECIR 2022: 44th European Conference on Information Retrieval. Springer, 289-303","author":"Hendriksen Mariya","year":"2022","unstructured":"Mariya Hendriksen, Maurits Bleeker, Svitlana Vakulenko, Nanne van Noord, Ernst Kuiper, and Maarten de Rijke. 2022. Extending CLIP for Category-to-image Retrieval in E-commerce. In ECIR 2022: 44th European Conference on Information Retrieval. Springer, 289-303."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.27"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.1212303"},{"key":"e_1_3_2_1_27_1","volume-title":"Matplotlib: A 2D graphics environment. Computing in science & engineering","author":"Hunter John D","year":"2007","unstructured":"John D Hunter. 2007. Matplotlib: A 2D graphics environment. Computing in science & engineering, Vol. 9, 03 (2007), 90-95."},{"key":"e_1_3_2_1_28_1","volume-title":"Scaling Up Visual and Vision-Language Representation Learning with Noisy Text Supervision. In International Conference on Machine Learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning with Noisy Text Supervision. In International Conference on Machine Learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6311"},{"key":"e_1_3_2_1_30_1","volume-title":"Martin","author":"Jurafsky Dan","year":"2009","unstructured":"Dan Jurafsky and James H. Martin. 2009. Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition, 2nd Edition. Prentice Hall, Pearson Education International.","edition":"2"},{"key":"e_1_3_2_1_31_1","first-page":"3128","article-title":"Deep Visual-Semantic Alignments for Generating Image Descriptions","author":"Karpathy Andrej","year":"2015","unstructured":"Andrej Karpathy and Fei-Fei Li. 2015. Deep Visual-Semantic Alignments for Generating Image Descriptions. In CVPR. 3128-3137.","journal-title":"CVPR."},{"key":"e_1_3_2_1_32_1","volume-title":"Fisher Vectors Derived from Hybrid Gaussian-Laplacian Mixture Models for Image Annotation. arXiv preprint arXiv:1411.7399","author":"Klein Benjamin","year":"2014","unstructured":"Benjamin Klein, Guy Lev, Gil Sadeh, and Lior Wolf. 2014. Fisher Vectors Derived from Hybrid Gaussian-Laplacian Mixture Models for Image Annotation. arXiv preprint arXiv:1411.7399 (2014)."},{"key":"e_1_3_2_1_33_1","volume-title":"Jifan Chen, Eunsol Choi, Gabriella Chronis, Anubrata Das, Katrin Erk, Matthew Lease, Junyi Jessy Li, Yating Wu, and Kyle Mahowald.","author":"Kovatchev Venelin","year":"2022","unstructured":"Venelin Kovatchev, Trina Chatterjee, Venkata Subrahmanyan Govindarajan, Jifan Chen, Eunsol Choi, Gabriella Chronis, Anubrata Das, Katrin Erk, Matthew Lease, Junyi Jessy Li, Yating Wu, and Kyle Mahowald. 2022. Longhorns at DADC 2022: How Many Linguists Does It Take to Fool a Question Answering Model? A Systematic Approach to Adversarial Attacks. CoRR, Vol. abs\/2206.14729 (2022)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3159652.3159716"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cognition.2016.04.003"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Gen Li Nan Duan Yuejian Fang Daxin Jiang and Ming Zhou. 2020a. Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. In AAAI.","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_1_38_1","volume-title":"Align Before Fuse: Vision and Language Representation Learning with Momentum Distillation. NeurIPS","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021b. Align Before Fuse: Vision and Language Representation Learning with Momentum Distillation. NeurIPS (2021), 9694-9705."},{"key":"e_1_3_2_1_39_1","first-page":"2022","volume-title":"Adversarial VQA: A New Benchmark for Evaluating the Robustness of VQA Models. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021","author":"Li Linjie","year":"2021","unstructured":"Linjie Li, Jie Lei, Zhe Gan, and Jingjing Liu. 2021a. Adversarial VQA: A New Benchmark for Evaluating the Robustness of VQA Models. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021. IEEE, 2022-2031."},{"key":"e_1_3_2_1_40_1","volume-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_41_1","first-page":"121","article-title":"OSCAR","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020b. OSCAR: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV. 121-137.","journal-title":"Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3473140"},{"key":"e_1_3_2_1_43_1","first-page":"740","article-title":"Microsoft COCO","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV. 740-755.","journal-title":"Common Objects in Context. In ECCV."},{"key":"e_1_3_2_1_44_1","volume-title":"Robust Information Retrieval. In SIGIR 2024: 47th international ACM SIGIR Conference on Research and Development in Information Retrieval. ACM, 3009-3012","author":"Liu Yu-An","year":"2024","unstructured":"Yu-An Liu, Ruqing Zhang, Jiafeng Guo, and Maarten de Rijke. 2024a. Robust Information Retrieval. In SIGIR 2024: 47th international ACM SIGIR Conference on Research and Development in Information Retrieval. ACM, 3009-3012."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614793"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657704"},{"key":"e_1_3_2_1_47_1","first-page":"13","article-title":"ViLBERT","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In NeurIPS. 13-23.","journal-title":"Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In NeurIPS."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-28238-6_39"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2927476"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3451390"},{"key":"e_1_3_2_1_51_1","first-page":"529","article-title":"SLIP","author":"Mu Norman","year":"2022","unstructured":"Norman Mu, Alexander Kirillov, David A. Wagner, and Saining Xie. 2022. SLIP: Self-Supervision Meets Language-Image Pre-training. In ECCV. 529-544.","journal-title":"Self-Supervision Meets Language-Image Pre-training. In ECCV."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.70"},{"key":"e_1_3_2_1_54_1","volume-title":"Analyzing Adversarial Attacks on Sequence-to-Sequence Relevance Models. In European Conference on Information Retrieval. Springer, 286-302","author":"Parry Andrew","year":"2024","unstructured":"Andrew Parry, Maik Fr\u00f6be, Sean MacAvaney, Martin Potthast, and Matthias Hagen. 2024. Analyzing Adversarial Attacks on Sequence-to-Sequence Relevance Models. In European Conference on Information Retrieval. Springer, 286-302."},{"key":"e_1_3_2_1_55_1","volume-title":"Evaluating the Robustness of Retrieval Pipelines with Query Variation Generators. In European conference on information retrieval. Springer, 397-412","author":"Penha Gustavo","year":"2022","unstructured":"Gustavo Penha, Arthur C\u00e2mara, and Claudia Hauff. 2022. Evaluating the Robustness of Retrieval Pipelines with Query Variation Generators. In European conference on information retrieval. Springer, 397-412."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the Fourth International Workshop on Designing Meaning Representations. 74-88","author":"Pesahov Leon","year":"2023","unstructured":"Leon Pesahov, Ayal Klein, and Ido Dagan. 2023. QA-Adj: Adding Adjectives to QA-based Semantics. In Proceedings of the Fourth International Workshop on Designing Meaning Representations. 74-88."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.98"},{"key":"e_1_3_2_1_58_1","unstructured":"Maciej Piasecki Bernd Broda and Stanislaw Szpakowicz. 2009. A WordNet from the Ground Up. Oficyna Wydawnicza Politechniki Wroclawskiej Wroclaw."},{"key":"e_1_3_2_1_59_1","first-page":"8748","article-title":"Learning Transferable Visual Models From Natural Language Supervision","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML. 8748-8763.","journal-title":"ICML."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval. 2132-2136","author":"Sidiropoulos Georgios","year":"2022","unstructured":"Georgios Sidiropoulos and Evangelos Kanoulas. 2022. Analyzing the Robustness of Dual Encoders for Dense Retrieval Against Misspellings. In Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval. 2132-2136."},{"key":"e_1_3_2_1_62_1","unstructured":"Weijie Su Xizhou Zhu Yue Cao Bin Li Lewei Lu Furu Wei and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In ICLR."},{"key":"e_1_3_2_1_63_1","first-page":"5099","article-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In EMNLP-IJCNLP. 5099-5110.","journal-title":"EMNLP-IJCNLP."},{"key":"e_1_3_2_1_64_1","volume-title":"UK","author":"Thomas Christopher","year":"2020","unstructured":"Christopher Thomas and Adriana Kovashka. 2020. Preserving Semantic Neighborhoods for Robust Cross-Modal Retrieval. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XVIII 16. 317-335."},{"key":"e_1_3_2_1_65_1","volume-title":"CSI Communications","volume":"23","author":"Umesh P","year":"2012","unstructured":"P Umesh. 2012. Image Processing in Python. CSI Communications, Vol. 23 (2012)."},{"key":"e_1_3_2_1_66_1","volume-title":"Workshop of the cross-language evaluation forum for european languages. Springer, 355-370","author":"Voorhees Ellen M","year":"2001","unstructured":"Ellen M Voorhees. 2001. The philosophy of information retrieval evaluation. In Workshop of the cross-language evaluation forum for european languages. Springer, 355-370."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00279"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1155\/2020\/8836195"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_1_70_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2022a. Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks. arXiv preprint arXiv:2208.10442 (2022)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548003"},{"key":"e_1_3_2_1_72_1","volume-title":"Technical Report CNS-TR-2010-001. California Institute of Technology.","author":"Welinder P.","year":"2010","unstructured":"P. Welinder, S. Branson, T. Mita, C. Wah, F. Schroff, S. Belongie, and P. Perona. 2010. Caltech-UCSD Birds 200. Technical Report CNS-TR-2010-001. California Institute of Technology."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00426"},{"key":"e_1_3_2_1_75_1","volume-title":"Challenges of Zero-Shot Recognition with Vision-Language Models: Granularity and Correctness. arXiv preprint arXiv:2306.16048","author":"Xu Zhenlin","year":"2023","unstructured":"Zhenlin Xu, Yi Zhu, Tiffany Deng, Abhay Mittal, Yanbei Chen, Manchen Wang, Paolo Favaro, Joseph Tighe, and Davide Modolo. 2023. Challenges of Zero-Shot Recognition with Vision-Language Models: Granularity and Correctness. arXiv preprint arXiv:2306.16048 (2023)."},{"key":"e_1_3_2_1_76_1","volume-title":"FILIP: Fine-grained Interactive Language-Image Pre-Training. In ICLR.","author":"Yao Lewei","year":"2022","unstructured":"Lewei Yao, Runhui Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2022. FILIP: Fine-grained Interactive Language-Image Pre-Training. In ICLR."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_78_1","unstructured":"Mert Yuksekgonul Federico Bianchi Pratyusha Kalluri Dan Jurafsky and James Zou. 2023. When and Why Vision-Language Models Behave like Bags-Of-Words and What to Do About It?. In ICLR."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"e_1_3_2_1_80_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 6784-6794","author":"Zhao Fei","year":"2022","unstructured":"Fei Zhao, Zhen Wu, Siyu Long, Xinyu Dai, Shujian Huang, and Jiajun Chen. 2022. Learning from Adjective-Noun Pairs: A Knowledge-Enhanced Framework for Target-Oriented Multimodal Sentiment Classification. In Proceedings of the 29th International Conference on Computational Linguistics. 6784-6794."},{"key":"e_1_3_2_1_81_1","volume-title":"On Evaluating Adversarial Robustness of Large Vision-Language Models. CoRR","author":"Zhao Yunqing","year":"2023","unstructured":"Yunqing Zhao, Tianyu Pang, Chao Du, Xiao Yang, Chongxuan Li, Ngai-Man Cheung, and Min Lin. 2023. On Evaluating Adversarial Robustness of Large Vision-Language Models. CoRR (2023)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2967594"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531951"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730290","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:00:14Z","timestamp":1755856814000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730290"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":83,"alternative-id":["10.1145\/3726302.3730290","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730290","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}