{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:46:21Z","timestamp":1777873581135,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No.62176043, No.62072077, No.U22A2097"],"award-info":[{"award-number":["Grant No.62176043, No.62072077, No.U22A2097"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3736982","type":"proceedings-article","created":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T13:30:13Z","timestamp":1754055013000},"page":"334-344","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Generative Thinking, Corrective Action: User-Friendly Composed Image Retrieval via Automatic Multi-Agent Collaboration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0097-3617","authenticated-orcid":false,"given":"Zhangtao","family":"Cheng","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6134-8359","authenticated-orcid":false,"given":"Yuhao","family":"Ma","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0876-0497","authenticated-orcid":false,"given":"Jian","family":"Lang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1474-3169","authenticated-orcid":false,"given":"Kunpeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Maryland, College Park, MD, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8163-3146","authenticated-orcid":false,"given":"Ting","family":"Zhong","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8699-8355","authenticated-orcid":false,"given":"Yong","family":"Wang","sequence":"additional","affiliation":[{"name":"Aiwen Tech, Zhengzhou, Henan, China and Hong Kong University of Science and Technology, Clear Water Bay, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8038-8150","authenticated-orcid":false,"given":"Fan","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China and Key Laboratory of Intelligent Digital Media Technology of Sichuan Province, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01407"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"e_1_3_2_2_5_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (Neurips)."},{"key":"e_1_3_2_2_6_1","volume-title":"CPSNet: Comprehensive Enhancement Representation for Polyp Segmentation Task. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1-5.","author":"Cai Jiati","year":"2025","unstructured":"Jiati Cai, Xiaogang Liu, Hongjie Yang, Yi Ding, Ting Zhong, and Zhen Qin. 2025. CPSNet: Comprehensive Enhancement Representation for Polyp Segmentation Task. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1-5."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3672065"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1488"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709308"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2025.3568289"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20044-1_32"},{"key":"e_1_3_2_2_13_1","volume-title":"ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity. In International Conference on Learning Representations (ICLR).","author":"Delmas Ginger","year":"2022","unstructured":"Ginger Delmas, Rafael Sampaio de Rezende, Gabriela Csurka, and Diane Larlus. 2022. ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-023-01219-2"},{"key":"e_1_3_2_2_15_1","volume-title":"CompoDiff: Versatile Composed Image Retrieval With Latent Diffusion. Transactions on Machine Learning Research (TMLR) 2024","author":"Gu Geonmo","year":"2024","unstructured":"Geonmo Gu, Sanghyuk Chun, Wonjae Kim, HeeJae Jun, Yoohoon Kang, and Sangdoo Yun. 2024. CompoDiff: Versatile Composed Image Retrieval With Latent Diffusion. Transactions on Machine Learning Research (TMLR) 2024 (2024)."},{"key":"e_1_3_2_2_16_1","volume-title":"Language-only Efficient Training of Zero-shot Composed Image Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13225-13234","author":"Gu Geonmo","year":"2024","unstructured":"Geonmo Gu, Sanghyuk Chun, Wonjae Kim, Yoohoon Kang, and Sangdoo Yun. 2024. Language-only Efficient Training of Zero-shot Composed Image Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13225-13234."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.163"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01866"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i11.33284"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","unstructured":"Gabriel Ilharco Mitchell Wortsman Ross Wightman Cade Gordon Nicholas Carlini Rohan Taori Achal Dave Vaishaal Shankar Hongseok Namkoong John Miller Hannaneh Hajishirzi Ali Farhadi and Ludwig Schmidt. [n.d.]. Openclip. https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"e_1_3_2_2_21_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV).","author":"Jang Young Kyun","year":"2024","unstructured":"Young Kyun Jang, Dat Huynh, Ashish Shah, Wen-Kai Chen, and Ser-Nam Lim. 2024. Spherical Linear Interpolation and Text-Anchoring for Zero-shot Composed Image Retrieval. In Proceedings of the European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_2_22_1","volume-title":"CaLa: Complementary Association Learning for Augmenting Comoposed Image Retrieval. In Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR). 2177-2187","author":"Jiang Xintong","year":"2024","unstructured":"Xintong Jiang, Yaxiong Wang, Mengjian Li, Yujiao Wu, Bingwen Hu, and Xueming Qian. 2024. CaLa: Complementary Association Learning for Augmenting Comoposed Image Retrieval. In Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR). 2177-2187."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1012933107"},{"key":"e_1_3_2_2_24_1","volume-title":"Vision-by-Language for Training-Free Compositional Image Retrieval. In International Conference on Learning Representations (ICLR).","author":"Karthik Shyamgopal","year":"2024","unstructured":"Shyamgopal Karthik, Karsten Roth, Massimiliano Mancini, and Zeynep Akata. 2024. Vision-by-Language for Training-Free Compositional Image Retrieval. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_25_1","volume-title":"Retrieval-Augmented Dynamic Prompt Tuning for Incomplete Multimodal Learning. In AAAI Conference on Artificial Intelligence (AAAI). 18035-18043","author":"Lang Jian","year":"2025","unstructured":"Jian Lang, Zhangtao Cheng, Ting Zhong, and Fan Zhou. 2025. Retrieval-Augmented Dynamic Prompt Tuning for Incomplete Multimodal Learning. In AAAI Conference on Artificial Intelligence (AAAI). 18035-18043."},{"key":"e_1_3_2_2_26_1","volume-title":"Cross-modal retrieval: a systematic review of methods and future directions. arXiv preprint arXiv:2308.14263","author":"Li Fengling","year":"2023","unstructured":"Fengling Li, Lei Zhu, Tianshi Wang, Jingjing Li, Zheng Zhang, and Heng Tao Shen. 2023. Cross-modal retrieval: a systematic review of methods and future directions. arXiv preprint arXiv:2308.14263 (2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"International Conference on Machine Learning (ICML). 19730-19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning (ICML). 19730-19742."},{"key":"e_1_3_2_2_28_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems (Neurips) 34 (2021), 9694-9705.","journal-title":"Advances in Neural Information Processing Systems (Neurips)"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714621"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_31_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems (Neurips) 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems (Neurips) 36 (2024)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657741"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00301"},{"key":"e_1_3_2_2_35_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in Neural Information Processing Systems (Neurips) 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in Neural Information Processing Systems (Neurips) 32 (2019)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592000"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1167\/jov.20.11.1521"},{"key":"e_1_3_2_2_38_1","volume-title":"International Conference on Machine Learning (ICML). 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML). 8748-8763."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1644"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00660"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657727"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_2_44_1","first-page":"80","article-title":"LDRE","author":"Yang Zhenyu","year":"2024","unstructured":"Zhenyu Yang, Dizhan Xue, Shengsheng Qian, Weiming Dong, and Changsheng Xu. 2024. LDRE: LLM-based Divergent Reasoning and Ensemble for Zero-Shot Composed Image Retrieval. In Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR). 80-90.","journal-title":"In Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)."},{"key":"e_1_3_2_2_45_1","volume-title":"CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research (TMLR) 2022","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, ZiruiWang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research (TMLR) 2022 (2022)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671869"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3433000"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3736982","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:04:15Z","timestamp":1777572255000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3736982"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":48,"alternative-id":["10.1145\/3711896.3736982","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3736982","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}