{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:56:08Z","timestamp":1781535368910,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272422, U22B2051, 62325602"],"award-info":[{"award-number":["62272422, U22B2051, 62325602"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Natural Science Foundation of Henan Province","award":["252300421225"],"award-info":[{"award-number":["252300421225"]}]},{"name":"Organized Young Scientific Research Team Cultivation Foundation of Zhengzhou University","award":["35220549"],"award-info":[{"award-number":["35220549"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810660","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"356-365","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Calibrate and Aggregate: Cross-Modal Retrieval with Distribution Alignment and Token Reduction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2440-323X","authenticated-orcid":false,"given":"Ziyi","family":"Wu","sequence":"first","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou university, Zhengzhou, Henan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6840-9978","authenticated-orcid":false,"given":"Jiahao","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4868-0709","authenticated-orcid":false,"given":"Mingyuan","family":"Jiu","sequence":"additional","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1437-9456","authenticated-orcid":false,"given":"Hongru","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6813-9146","authenticated-orcid":false,"given":"Hichem","family":"Sahbi","sequence":"additional","affiliation":[{"name":"CNRS LIP6, Sorbonne University, Paris, France"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6885-3451","authenticated-orcid":false,"given":"Mingliang","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","unstructured":"Shivaji Alaparthi and Manit Mishra. 2021. Bidirectional Encoder Representations from Transformers (BERT): A sentiment analysis odyssey. Journal of Marketing Analytics 9 2 (2021) 118\u2013126. 10.1057\/s41270-021-00109-8","DOI":"10.1057\/s41270-021-00109-8"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_3_1_4_2","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.12966 (2023)."},{"key":"e_1_3_3_1_5_2","unstructured":"Min Cao Shiping Li Juntao Li Liqiang Nie and Min Zhang. 2022. Image-text retrieval: A survey on recent research and development. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.14713 (2022)."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","unstructured":"Feiyu Chen Jie Shao Yonghui Zhang Xing Xu and Heng\u00a0Tao Shen. 2021. Interclass-relativity-adaptive metric learning for cross-modal matching and beyond. IEEE Transactions on Multimedia 23 (2021) 3073\u20133084. 10.1109\/TMM.2020.3019710","DOI":"10.1109\/TMM.2020.3019710"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","unstructured":"Jia Chen and Hong Zhang. 2024. Semantic enhancement and multi-level alignment network for cross-modal retrieval. Multimedia Tools and Applications 83 40 (2024) 88221\u201388243. 10.1007\/s11042-023-17956-5","DOI":"10.1007\/s11042-023-17956-5"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2305.06500"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Yifei Deng Zhengyu Chen Chenglong Li and Jin Tang. 2025. Uncertainty-aware coarse-to-fine alignment for text-image person retrieval. Visual Intelligence 3 1 (2025) 1\u201314. 10.1007\/s44267-025-00078-x","DOI":"10.1007\/s44267-025-00078-x"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_3_1_13_2","volume-title":"British Machine Vision Conference 2018, (BMVC)","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David\u00a0J Fleet, Jamie\u00a0Ryan Kiros, and Sanja Fidler. 2018. VSE++: Improving Visual-Semantic Embeddings with Hard Negatives. In British Machine Vision Conference 2018, (BMVC) (Newcastle, UK). BMVA Press, Durham, UK, 13\u00a0pages."},{"key":"e_1_3_3_1_14_2","unstructured":"Minghui Fang Shengpeng Ji Jialong Zuo Hai Huang Yan Xia Jieming Zhu Xize Cheng Xiaoda Yang Wenrui Liu Gang Wang et\u00a0al. 2024. Ace: A generative cross-modal retrieval framework with coarse-to-fine semantic modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.17507 (2024)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00750"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00536"},{"key":"e_1_3_3_1_17_2","first-page":"7181","volume-title":"5th International Conference on Learning Representations (ICLR)","author":"Jang Eric","year":"2017","unstructured":"Eric Jang, Shixiang Gu, and Ben Poole. 2017. Categorical reparameterization with gumbel-softmax. In 5th International Conference on Learning Representations (ICLR) (Toulon, France). OpenReview.net, Amherst, MA, USA, 7181\u20137189."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Jinhyun Jang Jiyoung Lee and Kwanghoon Sohn. 2025. Descriptive Image-Text Matching with Graded Contextual Similarity. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09997 (2025).","DOI":"10.2139\/ssrn.5475995"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02243"},{"key":"e_1_3_3_1_20_2","series-title":"Proceedings of Machine Learning Research (PMLR)","first-page":"5583","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","volume":"139","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In Proceedings of the 38th International Conference on Machine Learning (ICML) (Vienna, Austria (Held Online)) (Proceedings of Machine Learning Research (PMLR), Vol.\u00a0139). PMLR, Brookline, MA, USA, 5583\u20135594. https:\/\/proceedings.mlr.press\/v139\/kim21k.html"},{"key":"e_1_3_3_1_21_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Kingma Diederik\u00a0P.","year":"2015","unstructured":"Diederik\u00a0P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In International Conference on Learning Representations (ICLR) (San Diego, CA, USA). OpenReview.net, Amherst, MA, USA. http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2014. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2014). 10.48550\/arXiv.1312.6114","DOI":"10.48550\/arXiv.1312.6114"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01891"},{"key":"e_1_3_3_1_25_2","first-page":"9694","volume-title":"Advances in neural information processing systems (NeurIPS)","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu\u00a0Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. In Advances in neural information processing systems (NeurIPS) (Sydney, Australia (Held Online)), Vol.\u00a034. Curran Associates, Inc., Red Hook, NY, USA, 9694\u20139705."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_29_2","unstructured":"Artemis Panagopoulou Le Xue Ning Yu Junnan Li Dongxu Li Shafiq Joty Ran Xu Silvio Savarese Caiming Xiong and Juan\u00a0Carlos Niebles. 2023. X-instructblip: A framework for aligning x-modal instruction-aware representations to llms and emergent cross-modal reasoning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.18799 (2023)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","unstructured":"Ruitao Pu Yang Qin Dezhong Peng Xiaomin Song and Huiming Zheng. 2025. Deep reversible consistency learning for cross-modal retrieval. IEEE Transactions on Multimedia 27 (2025) 4095\u20134106. 10.1109\/TMM.2025.3535313","DOI":"10.1109\/TMM.2025.3535313"},{"key":"e_1_3_3_1_31_2","first-page":"8748","volume-title":"International conference on machine learning (ICML)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning (ICML) (Vienna, Austria (Held Online)), Vol.\u00a0139. PMLR, Brookline, MA, USA, 8748\u20138763."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2017. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) 39 6 (2017) 1137\u20131149. 10.1109\/TPAMI.2016.2577031","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_3_1_34_2","unstructured":"Quan Sun Yuxin Fang Ledell Wu Xinlong Wang and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.15389 (2023). https:\/\/arxiv.org\/abs\/2303.15389"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","unstructured":"Di Wang Jiabo Tian Xiao Liang Yumin Tian and Lihuo He. 2025. Global-aware Fragment Representation Aggregation Network for image-text retrieval. Pattern Recognition 159 (2025) 111085. 10.1016\/j.patcog.2024.111085","DOI":"10.1016\/j.patcog.2024.111085"},{"key":"e_1_3_3_1_36_2","unstructured":"Kaiye Wang Qiyue Yin Wei Wang Shu Wu and Liang Wang. 2016. A comprehensive survey on cross-modal retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1607.06215 (2016). https:\/\/arxiv.org\/abs\/1607.06215"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","unstructured":"Wenzhang Wei Zhipeng Gui Changguang Wu Anqi Zhao Dehua Peng and Huayi Wu. 2025. Dynamic visual semantic sub-embeddings and fast re-ranking for image-text retrieval. IEEE Transactions on Multimedia 27 (2025) 3781\u20133796. 10.1109\/TMM.2025.3535373","DOI":"10.1109\/TMM.2025.3535373"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00461"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"e_1_3_3_1_40_2","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et\u00a0al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.14178 (2023)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","unstructured":"Peter Young Alice Lai Micah Hodosh and Julia Hockenmaier. 2014. From Image Descriptions to Visual Denotations: New Similarity Metrics for Semantic Inference over Event Descriptions. Transactions of the Association for Computational Linguistics 2 (2014) 67\u201378. 10.1162\/tacl_a_00166","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_3_1_42_2","unstructured":"Jiahui Yu Zirui Wang Vijay Vasudevan Legg Yeung Mojtaba Seyedhosseini and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. Transactions on Machine Learning Research 2022 (2022). https:\/\/openreview.net\/forum?id=Ee277P3AYC"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20235"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","unstructured":"Yan Zhang Zhong Ji Yanwei Pang and Jungong Han. 2025. Hierarchical and complementary experts transformer with momentum invariance for image-text retrieval. Knowledge-Based Systems (KBS) 309 (2025) 112912. 10.1016\/j.knosys.2024.112912","DOI":"10.1016\/j.knosys.2024.112912"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:47:14Z","timestamp":1781534834000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810660"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":44,"alternative-id":["10.1145\/3805622.3810660","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810660","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}