{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:23Z","timestamp":1755825023090,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"name":"Guangdong Key Lab of AI and Multi-modal Data Processing","award":["2020KSYS007"],"award-info":[{"award-number":["2020KSYS007"]}]},{"name":"National Key R&D Program of China","award":["2022YFE0201400"],"award-info":[{"award-number":["2022YFE0201400"]}]},{"name":"the Guangdong Provincial Key Laboratory of Interdisciplinary Research and Application for Data Science, BNU-HKBU United International College","award":["2022B1212010006"],"award-info":[{"award-number":["2022B1212010006"]}]},{"name":"Natural Science Foundation of China","award":["62076029"],"award-info":[{"award-number":["62076029"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733298","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"349-357","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Contextual Reasoning for Robust Composed Image Retrieval with Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-0848-9814","authenticated-orcid":false,"given":"Peng","family":"Gao","sequence":"first","affiliation":[{"name":"Hong Kong Baptist University, Kowloon Tong, Hong Kong, China and Beijing Normal University-Hong Kong Baptist University United International College, Zhu Hai, Guang Dong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2514-3913","authenticated-orcid":false,"given":"Yujian","family":"Lee","sequence":"additional","affiliation":[{"name":"Hong Kong Baptist University, Kowloon Tong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9950-2672","authenticated-orcid":false,"given":"Xubo","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Surrey, Guildford, England, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1681-7926","authenticated-orcid":false,"given":"Hui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing Normal University-Hong Kong Baptist University United International College, Zhu Hai, Guang Dong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8431-5471","authenticated-orcid":false,"given":"Zailong","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Wollonggong, Wollongong, New South Wales, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3478-3616","authenticated-orcid":false,"given":"Yiyang","family":"Hu","sequence":"additional","affiliation":[{"name":"Hong Kong Baptist University, Kowloon Tong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4709-9932","authenticated-orcid":false,"given":"Guquan","family":"Jing","sequence":"additional","affiliation":[{"name":"Hong Kong Baptist University, Kowloon Tong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0770-4847","authenticated-orcid":false,"given":"Yunting","family":"Lai","sequence":"additional","affiliation":[{"name":"Hong Kong Baptist University, Kowloon Tong, Hong Kong, 
China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617597"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00127"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3607827.3616844"},{"key":"e_1_3_2_1_4_1","volume-title":"Multi-granular Semantic Mining for Composed Image Retrieval. In 2024 IEEE International Conference on Multimedia and Expo. 1--6.","author":"Chen Xiaotong","year":"2024","unstructured":"Xiaotong Chen, Shikui Wei, Gangjian Zhang, and Yao Zhao. 2024a. Multi-granular Semantic Mining for Composed Image Retrieval. In 2024 IEEE International Conference on Multimedia and Expo. 1--6."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00307"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27885"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640345"},{"key":"e_1_3_2_1_8_1","volume-title":"Gabriela Csurka, and Diane Larlus.","author":"Delmas Ginger","year":"2022","unstructured":"Ginger Delmas, Rafael Sampaio de Rezende, Gabriela Csurka, and Diane Larlus. 2022. Artemis: Attention-based retrieval with text-explicit matching and implicit similarity. arXiv preprint:2203.08101 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Rick Siow Mong Goh, and Yong Liu","author":"Feng Chun-Mei","year":"2023","unstructured":"Chun-Mei Feng, Yang Bai, Tao Luo, Zhen Li, Salman Khan, Wangmeng Zuo, Xinxing Xu, Rick Siow Mong Goh, and Yong Liu. 2023. Vqa4cir: Boosting composed image retrieval with visual question answering. arXiv preprint arXiv:2312.12273 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01371"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2"},{"key":"e_1_3_2_1_12_1","volume-title":"Dialog-based interactive image retrieval. Advances in neural information processing systems","author":"Guo Xiaoxiao","year":"2018","unstructured":"Xiaoxiao Guo, Hui Wu, Yu Cheng, Steven Rennie, Gerald Tesauro, and Rogerio Feris. 2018. Dialog-based interactive image retrieval. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_13_1","volume-title":"The fashion iq dataset: Retrieving images by combining side information and relative natural language feedback. arXiv preprint arXiv:1905.12794","author":"Guo Xiaoxiao","year":"2019","unstructured":"Xiaoxiao Guo, Hui Wu, Yupeng Gao, Steven Rennie, and Rogerio Feris. 2019. The fashion iq dataset: Retrieving images by combining side information and relative natural language feedback. arXiv preprint arXiv:1905.12794, Vol. 1, 2 (2019), 7."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00262"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.3390\/app14125068"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28004"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. 
PMLR, 19730--19742."},{"key":"e_1_3_2_1_19_1","volume-title":"A Survey of Multimodal Composite Editing and Retrieval. arXiv preprint arXiv:2409.05405","author":"Li Suyan","year":"2024","unstructured":"Suyan Li, Fuxiang Huang, and Lei Zhang. 2024. A Survey of Multimodal Composite Editing and Retrieval. arXiv preprint arXiv:2409.05405 (2024)."},{"key":"e_1_3_2_1_20_1","first-page":"1","article-title":"Text-dominant multimodal perception network for sentiment analysis based on cross-modal semantic enhancements","volume":"55","author":"Li Zuhe","year":"2025","unstructured":"Zuhe Li, Panbo Liu, Yushan Pan, Jun Yu, Weihua Liu, Haoran Chen, Yiming Luo, and Hao Wang. 2025. Text-dominant multimodal perception network for sentiment analysis based on cross-modal semantic enhancements. Applied Intelligence, Vol. 55, 2 (2025), 1--17.","journal-title":"Applied Intelligence"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00565"},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_24_1","volume-title":"Leveraging Large Vision-Language Model as User Intent-aware Encoder for Composed Image Retrieval. arXiv preprint arXiv:2412.11087","author":"Sun Zelong","year":"2024","unstructured":"Zelong Sun, Dong Jing, Guoxing Yang, Nanyi Fei, and Zhiwu Lu. 2024a. Leveraging Large Vision-Language Model as User Intent-aware Encoder for Composed Image Retrieval. arXiv preprint arXiv:2412.11087 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446291"},{"key":"e_1_3_2_1_26_1","volume-title":"Image search with text feedback by additive attention compositional learning. arXiv preprint arXiv:2203.03809","author":"Tian Yuxin","year":"2022","unstructured":"Yuxin Tian, Shawn Newsam, and Kofi Boakye. 2022. Image search with text feedback by additive attention compositional learning. arXiv preprint arXiv:2203.03809 (2022)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00660"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01102"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681493"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657727"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462967"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611817"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3235495"},{"key":"e_1_3_2_1_34_1","volume-title":"Modeling explicit concerning states for reinforcement learning in visual dialogue. arXiv preprint arXiv:2107.05250","author":"Xu Zipeng","year":"2021","unstructured":"Zipeng Xu, Fandong Meng, Xiaojie Wang, Duo Zheng, Chenxu Lv, and Jie Zhou. 2021. Modeling explicit concerning states for reinforcement learning in visual dialogue. 
arXiv preprint arXiv:2107.05250 (2021)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28479"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475483"},{"key":"e_1_3_2_1_37_1","volume-title":"He","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, and Zhihui et al. He. 2024. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint:2408.01800 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475659"},{"key":"e_1_3_2_1_40_1","volume-title":"Deep multimodal data fusion. ACM computing surveys","author":"Zhao Fei","year":"2024","unstructured":"Fei Zhao, Chengcui Zhang, and Baocheng Geng. 2024. Deep multimodal data fusion. ACM computing surveys, Vol. 56, 9 (2024), 1--36."},{"key":"e_1_3_2_1_41_1","volume-title":"Enhancing visual dialog questioner with entity-based strategy learning and augmented guesser. arXiv preprint arXiv:2109.02297","author":"Zheng Duo","year":"2021","unstructured":"Duo Zheng, Zipeng Xu, Fandong Meng, Xiaojie Wang, Jiaan Wang, and Jie Zhou. 2021. Enhancing visual dialog questioner with entity-based strategy learning and augmented guesser. arXiv preprint arXiv:2109.02297 (2021)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_43_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Zhou Yongchao","year":"2022","unstructured":"Yongchao Zhou, Andrei Ioan Muresanu, Ziwen Han, Keiran Paster, Silviu Pitis, Harris Chan, and Jimmy Ba. 2022a. Large language models are human-level prompt engineers. In The Eleventh International Conference on Learning Representations."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733298","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:02:44Z","timestamp":1755748964000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733298"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":43,"alternative-id":["10.1145\/3731715.3733298","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733298","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
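
The record above is a standard Crossref "works" payload, so it can be re-fetched and parsed programmatically. The Python below is a minimal sketch, not part of the record itself: it assumes the public Crossref REST API at api.crossref.org and the third-party requests library, and the mailto address is a placeholder to be replaced with your own contact (Crossref's "polite pool" convention).

import requests

# DOI taken from the record above.
DOI = "10.1145/3731715.3733298"

# Crossref asks clients to identify themselves via a mailto parameter;
# the address here is a placeholder.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.com"},
    timeout=30,
)
resp.raise_for_status()

# The payload has the same shape as the record above; the bibliographic
# fields live under the top-level "message" object.
msg = resp.json()["message"]

title = msg["title"][0]
authors = ", ".join(
    f"{a.get('given', '')} {a.get('family', '')}".strip() for a in msg["author"]
)
year = msg["issued"]["date-parts"][0][0]

print(f"{authors}. {year}. {title}. {msg['container-title'][0]}, {msg['page']}.")
print(f"References deposited: {msg['references-count']}")

Run against this DOI, the script prints a one-line citation for the ICMR '25 paper and its deposited reference count (43). Note that fields such as "title" and "container-title" are arrays in Crossref records, which is why the code indexes element 0.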