{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T13:53:45Z","timestamp":1774965225496,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T00:00:00Z","timestamp":1689638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62072462"],"award-info":[{"award-number":["No. 62072462"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["No. 2020AAA0108600"],"award-info":[{"award-number":["No. 2020AAA0108600"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,7,19]]},"DOI":"10.1145\/3539618.3591758","type":"proceedings-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:22:23Z","timestamp":1689726143000},"page":"1241-1251","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Rethinking Benchmarks for Cross-modal Image-text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9371-5256","authenticated-orcid":false,"given":"Weijing","family":"Chen","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9809-8864","authenticated-orcid":false,"given":"Linli","family":"Yao","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,7,18]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Empirical Evaluation of Gated Recurrent Neural Network Architectures in Aviation Delay Prediction. In 2020 5th International Conference on Computing, Communication and Security (ICCCS). 1--7.","author":"Ballakur Amulya Arun","year":"2020","unstructured":"Amulya Arun Ballakur and Arti Arya. 2020. Empirical Evaluation of Gated Recurrent Neural Network Architectures in Aviation Delay Prediction. In 2020 5th International Conference on Computing, Communication and Security (ICCCS). 1--7."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks.","author":"Desai Karan","year":"2021","unstructured":"Karan Desai, Gaurav Kaul, Zubin Aysola, and Justin Johnson. 2021. RedCaps: Web-curated image-text data created by the people, for the people. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_2_1_5_1","volume-title":"Ninth International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Ninth International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_6_1","volume-title":"An Empirical Study of Training End-to-End Vision-and-Language Transformers. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18145--18155","author":"Dou Zi-Yi","year":"2022","unstructured":"Zi-Yi Dou, Yichong Xu, Zhe Gan, Jianfeng Wang, Shuohang Wang, Lijuan Wang, Chenguang Zhu, Pengchuan Zhang, Lu Yuan, Nanyun Peng, Zicheng Liu, and Michael Zeng. 2022. An Empirical Study of Training End-to-End Vision-and-Language Transformers. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18145--18155."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the British Machine Vision Conference(BMVC).","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J. Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In Proceedings of the British Machine Vision Conference(BMVC)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 26th International Conference on Neural Information Processing Systems -","volume":"2","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S. Corrado, Jonathon Shlens, Samy Bengio, Jeffrey Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. DeViSE: A Deep Visual-Semantic Embedding Model. In Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2. 2121--2129."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems. 6616--6628","author":"Gan Zhe","year":"2020","unstructured":"Zhe Gan, Yen-Chun Chen, Linjie Li, Chen Zhu, Yu Cheng, and Jingjing Liu. 2020. Large-Scale Adversarial Training for Vision-and-Language Representation Learning. In Proceedings of the 34th International Conference on Neural Information Processing Systems. 6616--6628."},{"key":"e_1_3_2_1_10_1","volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 770--778","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 770--778."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 24th International Conference on Artificial Intelligence. 4188--4192","author":"Hodosh Micah","year":"2015","unstructured":"Micah Hodosh, Peter Young, and Julia Hockenmaier. 2015. Framing Image Description as a Ranking Task: Data, Models and Evaluation Metrics. In Proceedings of the 24th International Conference on Artificial Intelligence. 4188--4192."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning. 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In Proceedings of the 38th International Conference on Machine Learning. 5583--5594."},{"key":"e_1_3_2_1_16_1","volume-title":"Semi-Supervised Classification with Graph Convolutional Networks. In International Conference on Learning Representations (ICLR).","author":"Thomas","unstructured":"Thomas N. Kipf and Max Welling. 2017. Semi-Supervised Classification with Graph Convolutional Networks. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_17_1","volume-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. International journal of computer vision","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David A. Shamma, Michael S. Bernstein, and Li Fei-Fei. 2017. Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. International journal of computer vision, Vol. 123, 1 (2017), 32--73."},{"key":"e_1_3_2_1_18_1","volume-title":"Stacked Cross Attention for Image-Text Matching. In European Conference on Computer Vision. Springer, 212--218","author":"Lee Kuang-Huei","year":"2018","unstructured":"Kuang-Huei Lee, Xi Chen, Gang Hua, Houdong Hu, and Xiaodong He. 2018. Stacked Cross Attention for Image-Text Matching. In European Conference on Computer Vision. Springer, 212--218."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_1_20_1","volume-title":"Visual Semantic Reasoning for Image-Text Matching. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 4653--4661","author":"Li Kunpeng","year":"2019","unstructured":"Kunpeng Li, Yulun Zhang, Kai Li, Yuanyuan Li, and Yun Fu. 2019. Visual Semantic Reasoning for Image-Text Matching. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 4653--4661."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"e_1_3_2_1_22_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In European Conference on Computer Vision. 121--137","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, Yejin Choi, and Jianfeng Gao. 2020b. Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In European Conference on Computer Vision. 121--137."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350869"},{"key":"e_1_3_2_1_25_1","volume-title":"Graph Structured Network for Image-Text Matching. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10918--10927","author":"Liu Chunxiao","year":"2020","unstructured":"Chunxiao Liu, Zhendong Mao, Tianzhu Zhang, Hongtao Xie, Bin Wang, and Yongdong Zhang. 2020. Graph Structured Network for Image-Text Matching. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10918--10927."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems. 13--23","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Proceedings of the 33rd International Conference on Neural Information Processing Systems. 13--23."},{"key":"e_1_3_2_1_27_1","first-page":"2579","article-title":"Visualizing Data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing Data using t-SNE. Journal of Machine Learning Research, Vol. 9, 86 (2008), 2579--2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_28_1","volume-title":"Dual Attention Networks for Multimodal Reasoning and Matching. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2156--2164","author":"Nam Hyeonseob","year":"2017","unstructured":"Hyeonseob Nam, Jung-Woo Ha, and Jeonghee Kim. 2017. Dual Attention Networks for Multimodal Reasoning and Matching. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2156--2164."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 24th International Conference on Neural Information Processing Systems. 1143--1151","author":"Ordonez Vicente","year":"2011","unstructured":"Vicente Ordonez, Girish Kulkarni, and Tamara L Berg. 2011. Im2Text: Describing Images Using 1 Million Captioned Photographs. In Proceedings of the 24th International Conference on Neural Information Processing Systems. 1143--1151."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413961"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462829"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_1_33_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. https:\/\/cdn.openai.com\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf Last accessed on 2023-1-1."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/720"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"e_1_3_2_1_38_1","volume-title":"Cross-modal Scene Graph Matching for Relationship-aware Image-Text Retrieval. In 2020 IEEE Winter Conference on Applications of Computer Vision (WACV). 1497--1506","author":"Wang Sijin","year":"2020","unstructured":"Sijin Wang, Ruiping Wang, Ziwei Yao, Shiguang Shan, and Xilin Chen. 2020. Cross-modal Scene Graph Matching for Relationship-aware Image-Text Retrieval. In 2020 IEEE Winter Conference on Applications of Computer Vision (WACV). 1497--1506."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/526"},{"key":"e_1_3_2_1_40_1","volume-title":"CAMP: Cross-Modal Adaptive Message Passing for Text-Image Retrieval. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 5763--5772","author":"Wang Zihao","year":"2019","unstructured":"Zihao Wang, Xihui Liu, Hongsheng Li, Lu Sheng, Junjie Yan, Xiaogang Wang, and Jing Shao. 2019a. CAMP: Cross-Modal Adaptive Message Passing for Text-Image Retrieval. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 5763--5772."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350940"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning. 25994--26009","author":"Zeng Yan","year":"2022","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2022. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. In Proceedings of the 39th International Conference on Machine Learning. 25994--26009."},{"key":"e_1_3_2_1_45_1","volume-title":"Negative-Aware Attention Framework for Image-Text Matching. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 15640--15649","author":"Zhang Kun","year":"2022","unstructured":"Kun Zhang, Zhendong Mao, Quan Wang, and Yongdong Zhang. 2022. Negative-Aware Attention Framework for Image-Text Matching. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 15640--15649."},{"key":"e_1_3_2_1_46_1","volume-title":"VinVL: Revisiting Visual Representations in Vision-Language Models. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 5575--5584","author":"Zhang Pengchuan","year":"2021","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, Lei Zhang, Lijuan Wang, Yejin Choi, and Jianfeng Gao. 2021. VinVL: Revisiting Visual Representations in Vision-Language Models. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 5575--5584."}],"event":{"name":"SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Taipei Taiwan","acronym":"SIGIR '23","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591758","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539618.3591758","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:01Z","timestamp":1750178821000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591758"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,18]]},"references-count":46,"alternative-id":["10.1145\/3539618.3591758","10.1145\/3539618"],"URL":"https:\/\/doi.org\/10.1145\/3539618.3591758","relation":{},"subject":[],"published":{"date-parts":[[2023,7,18]]},"assertion":[{"value":"2023-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}