{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:20:11Z","timestamp":1775841611864,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376196, 62036012, U23A20387, 62106262, 62202331, 62206200, and 62276118;"],"award-info":[{"award-number":["62376196, 62036012, U23A20387, 62106262, 62202331, 62206200, and 62276118;"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key Research and Development Plan of China","award":["2021ZD0112200"],"award-info":[{"award-number":["2021ZD0112200"]}]},{"name":"Tianjin Natural Science Foundation","award":["22JCYBJC00030"],"award-info":[{"award-number":["22JCYBJC00030"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680591","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"2350-2359","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Overcoming the Pitfalls of Vision-Language Model for Image-Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8153-9977","authenticated-orcid":false,"given":"Feifei","family":"Zhang","sequence":"first","affiliation":[{"name":"Tianjin University of Technology, TianJin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6285-9808","authenticated-orcid":false,"given":"Sijia","family":"Qu","sequence":"additional","affiliation":[{"name":"Tianjin University of Technology, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2074-0228","authenticated-orcid":false,"given":"Fan","family":"Shi","sequence":"additional","affiliation":[{"name":"Tianjin University of Technology, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-9665","authenticated-orcid":false,"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"CASIA, UCAS, &amp; Peng Cheng Laboratory, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Grounding Everything: Emerging Localization Properties in Vision-Language Transformers. arXiv preprint arXiv:2312.00878","author":"Bousselham Walid","year":"2023","unstructured":"Walid Bousselham, Felix Petersen, Vittorio Ferrari, and Hilde Kuehne. 2023. Grounding Everything: Emerging Localization Properties in Vision-Language Transformers. arXiv preprint arXiv:2312.00878 (2023)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01163"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"e_1_3_2_2_5_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325, Vol. 3 (2015), 1--7."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01060"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_2_2_9_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 6935--6944","author":"Dess Roberto","unstructured":"Roberto Dess`i, Michele Bevilacqua, Eleonora Gualdoni, Nathana\u00ebl Carraz Rakotonirina, Francesca Franzon, and Marco Baroni. 2023. Cross-domain image captioning with discriminative finetuning. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 6935--6944."},{"key":"e_1_3_2_2_10_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 4171--4186","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 4171--4186."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR). 1--12","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2021. An image is worth 16x16 words: Transformers for image recognition at scale. In Proceedings of the International Conference on Learning Representations (ICLR). 1--12."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_2_14_1","volume-title":"British Machine Vision Conference (BMVC). 12","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. Vse: Improving visual-semantic embeddings with hard negatives. In British Machine Vision Conference (BMVC). 12."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01046"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02519"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02238"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29789"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_2_22_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In Proceedings of the International Conference on Machine Learning (ICML). 4904--4916."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02318"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01786"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In Proceedings of the International Conference on Machine Learning (ICML). 5583--5594."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612101"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS). 1--14","author":"Li Hao","year":"2024","unstructured":"Hao Li, Jingkuan Song, Lianli Gao, Xiaosu Zhu, and Hengtao Shen. 2024. Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval. In Proceedings of the Advances in Neural Information Processing Systems (NIPS). 1--14."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148470"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3128744"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289753"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/158"},{"key":"e_1_3_2_2_34_1","first-page":"17612","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","volume":"35","author":"Liang Victor Weixin","year":"2022","unstructured":"Victor Weixin Liang, Yuhui Zhang, Yongchan Kwon, Serena Yeung, and James Y Zou. 2022. Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning. Proceedings of the Advances in Neural Information Processing Systems (NIPS), Vol. 35 (2022), 17612--17625.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS)"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3186740"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02237"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01029"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV). 4157--4168","author":"Bharat Singh Koutilya PNVR","year":"2023","unstructured":"Koutilya PNVR, Bharat Singh, Pallabi Ghosh, Behjat Siddiquie, and David Jacobs. 2023. LD-ZNet: A Latent Diffusion Approach for Text-Based Image Segmentation. In Proceedings of the IEEE International Conference on Computer Vision (ICCV). 4157--4168."},{"key":"e_1_3_2_2_41_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML). 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the 38th International Conference on Machine Learning (ICML). 8748--8763."},{"key":"e_1_3_2_2_42_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1 (2022), 3."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"e_1_3_2_2_44_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS). 91--99","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Proceedings of the Advances in Neural Information Processing Systems (NIPS). 91--99."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00271"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"e_1_3_2_2_48_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS). 5998--6008","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proceedings of the Advances in Neural Information Processing Systems (NIPS). 5998--6008."},{"key":"e_1_3_2_2_49_1","first-page":"3363","article-title":"Learning structural representations for recipe generation and food retrieval","volume":"45","author":"Wang Hao","year":"2022","unstructured":"Hao Wang, Guosheng Lin, Steven CH Hoi, and Chunyan Miao. 2022. Learning structural representations for recipe generation and food retrieval. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), Vol. 45 (2022), 3363--3377.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00277"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"e_1_3_2_2_52_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS). 4514--4528","author":"Xue Hongwei","year":"2021","unstructured":"Hongwei Xue, Yupan Huang, Bei Liu, Houwen Peng, Jianlong Fu, Houqiang Li, and Jiebo Luo. 2021. Probing inter-modality: Visual parsing with self-attention for vision-and-language pre-training. In Proceedings of the Advances in Neural Information Processing Systems (NIPS). 4514--4528."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00509"},{"key":"e_1_3_2_2_54_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS). 1--14","author":"Yarom Michal","year":"2024","unstructured":"Michal Yarom, Yonatan Bitton, Soravit Changpinyo, Roee Aharoni, Jonathan Herzig, Oran Lang, Eran Ofek, and Idan Szpektor. 2024. What you see is what you read? improving text-image alignment evaluation. In Proceedings of the Advances in Neural Information Processing Systems (NIPS). 1--14."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00683"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28538"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_2_61_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR). 1--10","author":"Zhao Shuai","year":"2024","unstructured":"Shuai Zhao, Xiaohan Wang, Linchao Zhu, and Yi Yang. 2024. Test-time adaptation with clip reward for zero-shot generalization in vision-language models. In Proceedings of the International Conference on Learning Representations (ICLR). 1--10."},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01015"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680591","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680591","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680591"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":62,"alternative-id":["10.1145\/3664647.3680591","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680591","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}