{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:32Z","timestamp":1781538992617,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["12501631"],"award-info":[{"award-number":["12501631"]}]},{"name":"the Nature Science Foundation Project of Chongqing Science and Technology Bureau","award":["CSTB2025NSCQ-GPX0985"],"award-info":[{"award-number":["CSTB2025NSCQ-GPX0985"]}]},{"name":"the Science and Technology Research Program of Chongqing Municipal Education Commission","award":["KJQN202500517"],"award-info":[{"award-number":["KJQN202500517"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810824","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"118-127","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["M-STAR: Multi-view Semantic Topology Alignment with Reasoning from VLMs for Image-Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9517-7826","authenticated-orcid":false,"given":"Xuewen","family":"He","sequence":"first","affiliation":[{"name":"Chongqing Normal University, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8078-1754","authenticated-orcid":false,"given":"Yuning","family":"Guo","sequence":"additional","affiliation":[{"name":"Chongqing Normal University, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2081-4315","authenticated-orcid":false,"given":"Fumao","family":"Xu","sequence":"additional","affiliation":[{"name":"Chongqing Normal University, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5517-3633","authenticated-orcid":false,"given":"Mingyong","family":"Li","sequence":"additional","affiliation":[{"name":"Chongqing Normal University, Chongqing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Junyu Chen Yihua Gao Mingyuan Ge and Mingyong Li. 2025. Ambiguity-Aware and High-order Relation learning for multi-grained image\u2013text matching. Knowledge-Based Systems 316 (2025) 113355.","DOI":"10.1016\/j.knosys.2025.113355"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME59968.2025.11209301"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_3_1_7_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_3_1_9_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_1_10_2","unstructured":"Fartash Faghri David\u00a0J Fleet Jamie\u00a0Ryan Kiros and Sanja Fidler. 2017. Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1707.05612 (2017)."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Lijie Fan Dilip Krishnan Phillip Isola Dina Katabi and Yonglong Tian. 2023. Improving clip training with language rewrites. Advances in Neural Information Processing Systems 36 (2023) 35544\u201335575.","DOI":"10.52202\/075280-1544"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733465"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i16.33922"},{"key":"e_1_3_3_1_16_2","first-page":"4904","volume-title":"International conference on machine learning","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904\u20134916."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Kunpeng Li Yulun Zhang Kai Li Yuanyuan Li and Yun Fu. 2022. Image-text embedding learning via visual and textual semantic reasoning. IEEE transactions on pattern analysis and machine intelligence 45 1 (2022) 641\u2013656.","DOI":"10.1109\/TPAMI.2022.3148470"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Mingyong Li Yihua Gao Honggang Zhao Ruiheng Li and Junyu Chen. 2025. Progressive semantic aggregation and structured cognitive enhancement for image\u2013text matching. Expert Systems with Applications 274 (2025) 126943.","DOI":"10.1016\/j.eswa.2025.126943"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Zhe Li Lei Zhang Kun Zhang Yongdong Zhang and Zhendong Mao. 2024. Fast accurate and lightweight memory-enhanced embedding learning framework for image-text retrieval. IEEE Transactions on Circuits and Systems for Video Technology 34 7 (2024) 6542\u20136558.","DOI":"10.1109\/TCSVT.2024.3358411"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Zhe Li Lei Zhang Kun Zhang Yongdong Zhang and Zhendong Mao. 2024. Improving image-text matching with bidirectional consistency of cross-modal alignment. IEEE Transactions on Circuits and Systems for Video Technology 34 7 (2024) 6590\u20136607.","DOI":"10.1109\/TCSVT.2024.3369656"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Wentao Feng Zhuoyao Liu Shudong Huang and Jiancheng Lv. 2025. Aligning Information Capacity Between Vision and Language via Dense-to-Sparse Feature Distillation for Image-Text Matching. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.14953 (2025).","DOI":"10.1109\/ICCV51701.2025.02013"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Manh-Duy Nguyen Binh\u00a0T Nguyen and Cathal Gurrin. 2021. A deep local and global scene-graph matching for image-text retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.02400 (2021).","DOI":"10.3233\/FAIA210049"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01361"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413961"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462829"},{"key":"e_1_3_3_1_32_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00056"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Sheng Shen Chunyuan Li Xiaowei Hu Yujia Xie Jianwei Yang Pengchuan Zhang Zhe Gan Lijuan Wang Lu Yuan Ce Liu et\u00a0al. 2022. K-lite: Learning transferable visual models with external knowledge. Advances in Neural Information Processing Systems 35 (2022) 15558\u201315573.","DOI":"10.52202\/068431-1132"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/720"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_3_1_38_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58586-0_2"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i5.20536"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Wenzhang Wei Zhipeng Gui Changguang Wu Anqi Zhao Dehua Peng and Huayi Wu. 2025. Dynamic visual semantic sub-embeddings and fast re-ranking for image-text retrieval. IEEE Transactions on Multimedia 27 (2025) 3781\u20133796.","DOI":"10.1109\/TMM.2025.3535373"},{"key":"e_1_3_3_1_43_2","unstructured":"Dongqing Wu Huihui Li Cang Gu Lei Guo and Hang Liu. 2024. Dual stream relation learning network for image-text retrieval. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_3_1_44_2","unstructured":"Shitao Xiao Zheng Liu Peitian Zhang Niklas Muennighoff Defu Lian and Jian-Yun Nie. 2023. C-pack: Packed resources for general chinese embeddings. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.07597 (2023)."},{"key":"e_1_3_3_1_45_2","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et\u00a0al. 2025. Qwen3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09388 (2025)."},{"key":"e_1_3_3_1_46_2","unstructured":"Lewei Yao Runhui Huang Lu Hou Guansong Lu Minzhe Niu Hang Xu Xiaodan Liang Zhenguo Li Xin Jiang and Chunjing Xu. 2021. Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.07783 (2021)."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Peter Young Alice Lai Micah Hodosh and Julia Hockenmaier. 2014. From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the association for computational linguistics 2 (2014) 67\u201378.","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Chao Zhang Zichao Yang Xiaodong He and Li Deng. 2020. Multimodal intelligence: Representation learning information fusion and applications. IEEE Journal of Selected Topics in Signal Processing 14 3 (2020) 478\u2013493.","DOI":"10.1109\/JSTSP.2020.2987728"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Kun Zhang Bo Hu Huatian Zhang Zhe Li and Zhendong Mao. 2024. Enhanced semantic similarity learning framework for image-text matching. IEEE Transactions on Circuits and Systems for Video Technology 34 4 (2024) 2973\u20132988.","DOI":"10.1109\/TCSVT.2023.3307554"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"crossref","unstructured":"Yan Zhang Zhong Ji Di Wang Yanwei Pang and Xuelong Li. 2024. USER: Unified semantic enhancement with momentum contrast for image-text retrieval. IEEE Transactions on Image Processing 33 (2024) 595\u2013609.","DOI":"10.1109\/TIP.2023.3348297"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00920"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Hongguang Zhu Chunjie Zhang Yunchao Wei Shujuan Huang and Yao Zhao. 2023. ESA: External space attention aggregation for image-text retrieval. IEEE Transactions on Circuits and Systems for Video Technology 33 10 (2023) 6131\u20136143.","DOI":"10.1109\/TCSVT.2023.3253548"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:28:13Z","timestamp":1781537293000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810824"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":52,"alternative-id":["10.1145\/3805622.3810824","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810824","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}