{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T16:06:49Z","timestamp":1780675609624,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62120106008"],"award-info":[{"award-number":["62120106008"]}]},{"name":"Anhui Provincial Science and Technology Fortification Plan","award":["202423k09020015"],"award-info":[{"award-number":["202423k09020015"]}]},{"name":"the Hefei Key Generic Technology Research and Development Program","award":["2024SGJ010"],"award-info":[{"award-number":["2024SGJ010"]}]},{"name":"the Youth Talent Support Program of the Anhui Association for Science and Technology","award":["RCTJ202420"],"award-info":[{"award-number":["RCTJ202420"]}]},{"name":"The Key Laboratory of Knowledge Engineering with Big Data (the Ministry of Education of China)","award":["BigKEOpen2025-01"],"award-info":[{"award-number":["BigKEOpen2025-01"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792154","type":"proceedings-article","created":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T13:28:36Z","timestamp":1777296516000},"page":"1959-1970","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Dual-Branch Multi-Granularity Network with Structured Contrastive Ranking for Cross-Modal Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-0218-2827","authenticated-orcid":false,"given":"Zihao","family":"Chen","sequence":"first","affiliation":[{"name":"The Key Laboratory of Knowledge Engineering with Big Data (the Ministry of Education of China), Hefei University of Technology, Hefei, Anhui, 
China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8203-0956","authenticated-orcid":false,"given":"Chenyang","family":"Bu","sequence":"additional","affiliation":[{"name":"The Key Laboratory of Knowledge Engineering with Big Data (the Ministry of Education of China), Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4942-9767","authenticated-orcid":false,"given":"Shengwei","family":"Ji","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Big Data, Hefei University, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2396-1704","authenticated-orcid":false,"given":"Xindong","family":"Wu","sequence":"additional","affiliation":[{"name":"The Key Laboratory of Knowledge Engineering with Big Data (the Ministry of Education of China), Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"16776","volume-title":"Ask in Any Modality: A Comprehensive Survey on Multimodal Retrieval-Augmented Generation. In Findings of the Association for Computational Linguistics: ACL","author":"Abootorabi Mohammad Mahdi","year":"2025","unstructured":"Mohammad Mahdi Abootorabi, Amirhosein Zobeiri, Mahdi Dehghani, Mohammadali Mohammadkhani, Bardia Mohammadi, Omid Ghahroodi, Mahdieh Soleymani Baghshah, and Ehsaneddin Asgari. 2025. Ask in Any Modality: A Comprehensive Survey on Multimodal Retrieval-Augmented Generation. In Findings of the Association for Computational Linguistics: ACL 2025. 
Vienna, Austria, 16776-16809."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2508.14801"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1100"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the IEEE Winter Conference on Applications of Computer Vision (WACV). 839-847","author":"Chattopadhay Aditya","unstructured":"Aditya Chattopadhay, Anirban Sarkar, Prantik Howlader, and Vineeth N. Balasubramanian. 2018. Grad-CAM: Generalized Gradient-Based Visual Explanations for Deep Convolutional Networks. In Proceedings of the IEEE Winter Conference on Applications of Computer Vision (WACV). 839-847."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3362065","article-title":"ACMNet: Adaptive confidence matching network for human behavior analysis via cross-modal retrieval","volume":"16","author":"Chen Hui","year":"2020","unstructured":"Hui Chen, Guiguang Ding, Zijia Lin, Sicheng Zhao, Xiaopeng Gu, Wenyuan Xu, and Jungong Han. 2020. ACMNet: Adaptive confidence matching network for human behavior analysis via cross-modal retrieval. ACM Transactions on Multimedia Computing, Communications, and Applications, Vol. 
16, 1s (2020), 1-21.","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3182549"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01117"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3570518"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02767"},{"key":"e_1_3_2_1_14_1","first-page":"4527","article-title":"Category alignment adversarial learning for cross-modal retrieval","volume":"35","author":"He Shiyuan","year":"2022","unstructured":"Shiyuan He, Weiyang Wang, Zheng Wang, Xing Xu, Yang Yang, Xiaoming Wang, and Heng Tao Shen. 2022. Category alignment adversarial learning for cross-modal retrieval. IEEE Transactions on Knowledge and Data Engineering, Vol. 35, 5 (2022), 4527-4538.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2019.05.017"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the Thirteenth International Conference on Learning Representations (ICLR). https:\/\/openreview.net\/forum?id=Usklli4gMc","author":"Hu Wenbo","year":"2025","unstructured":"Wenbo Hu, Jia-Chen Gu, Zi-Yi Dou, Mohsen Fayyaz, Pan Lu, Kai-Wei Chang, and Nanyun Peng. 2025. MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models. In Proceedings of the Thirteenth International Conference on Learning Representations (ICLR). 
https:\/\/openreview.net\/forum?id=Usklli4gMc"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/901"},{"key":"e_1_3_2_1_18_1","volume-title":"Relink: Constructing Query-Driven Evidence Graph On-the-Fly for GraphRAG. arXiv preprint arXiv:2601.07192","author":"Huang Manzong","year":"2026","unstructured":"Manzong Huang, Chenyang Bu, Yi He, Xingrui Zhuo, and Xindong Wu. 2026. Relink: Constructing Query-Driven Evidence Graph On-the-Fly for GraphRAG. arXiv preprint arXiv:2601.07192 (2026)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML). 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning with Noisy Text Supervision. In Proceedings of the 38th International Conference on Machine Learning (ICML). 4904-4916."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02243"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"e_1_3_2_1_22_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 
34 (2021), 9694-9705."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3640697"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3152086"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01524"},{"key":"e_1_3_2_1_26_1","volume-title":"Video-rag: Visually-aligned retrieval-augmented long video comprehension. arXiv preprint arXiv:2411.13093","author":"Luo Yongdong","year":"2024","unstructured":"Yongdong Luo, Xiawu Zheng, Xiao Yang, Guilin Li, Haojia Lin, Jinfa Huang, Jiayi Ji, Fei Chao, Jiebo Luo, and Rongrong Ji. 2024. Video-rag: Visually-aligned retrieval-augmented long video comprehension. arXiv preprint arXiv:2411.13093 (2024)."},{"key":"e_1_3_2_1_27_1","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, Nov (2008), 2579-2605.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2852503"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2025.3535313"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3101642"},{"key":"e_1_3_2_1_31_1","first-page":"4794","article-title":"Integrating multi-label contrastive learning with dual adversarial graph neural networks for cross-modal retrieval","volume":"45","author":"Qian Shengsheng","year":"2022","unstructured":"Shengsheng Qian, Dizhan Xue, Quan Fang, and Changsheng Xu. 2022. Integrating multi-label contrastive learning with dual adversarial graph neural networks for cross-modal retrieval. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 
45, 4 (2022), 4794-4811.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.5555\/1866696.1866717"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_1_34_1","volume-title":"Beyond text: Optimizing rag with multimodal inputs for industrial applications. arXiv preprint arXiv:2410.21943","author":"Riedler Monica","year":"2024","unstructured":"Monica Riedler and Stefan Langer. 2024. Beyond text: Optimizing rag with multimodal inputs for industrial applications. arXiv preprint arXiv:2410.21943 (2024)."},{"key":"e_1_3_2_1_35_1","first-page":"1","volume-title":"Proceedings of the Conference on Data Mining and Data Warehouses (SiKDD","author":"Rupnik Jan","year":"2010","unstructured":"Jan Rupnik and John Shawe-Taylor. 2010. Multi-View Canonical Correlation Analysis. In Proceedings of the Conference on Data Mining and Data Warehouses (SiKDD 2010). 1-4."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637442"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29867"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28324"},{"key":"e_1_3_2_1_39_1","volume-title":"MMGraphRAG: Bridging Vision and Language with Interpretable Multimodal Knowledge Graphs. arXiv preprint arXiv:2507.20804","author":"Wan Xueyao","year":"2025","unstructured":"Xueyao Wan and Hang Yu. 2025. MMGraphRAG: Bridging Vision and Language with Interpretable Multimodal Knowledge Graphs. 
arXiv preprint arXiv:2507.20804 (2025)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2024.3525147"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3136330"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680886"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Xia Peng","year":"2025","unstructured":"Peng Xia, Kangyu Zhu, Haoran Li, Tianze Wang, Weijia Shi, Sheng Wang, Linjun Zhang, James Zou, and Huaxiu Yao. 2025. MMed-RAG: Versatile Multimodal RAG System for Medical Vision-Language Models. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548066"},{"key":"e_1_3_2_1_45_1","volume-title":"Improving Fine-grained Understanding for Retrieval in Human Motion and Text","author":"Yan Sheng","year":"2024","unstructured":"Sheng Yan, Yong Wang, Xin Du, Hongchang Jin, and Mengyuan Liu. 2024. Improving Fine-grained Understanding for Retrieval in Human Motion and Text. IEEE Signal Processing Letters (2024)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_47_1","volume-title":"mKG-RAG: Multimodal Knowledge Graph-Enhanced RAG for Visual Question Answering. arXiv preprint arXiv:2508.05318","author":"Yuan Xu","year":"2025","unstructured":"Xu Yuan, Liangbo Ning, Wenqi Fan, and Qing Li. 2025. mKG-RAG: Multimodal Knowledge Graph-Enhanced RAG for Visual Question Answering. 
arXiv preprint arXiv:2508.05318 (2025)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288383"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3412760"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3550526"},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 3536-3545","author":"Zhang Qi","unstructured":"Qi Zhang, Zhen Lei, Zhaoxiang Zhang, and Stan Z. Li. 2020. Context-Aware Attention Network for Image-Text Retrieval. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 3536-3545."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/413"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714794"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 
2026"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3774904.3792154","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:41:11Z","timestamp":1780674071000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":54,"alternative-id":["10.1145\/3774904.3792154","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792154","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}