{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,2]],"date-time":"2025-08-02T19:44:38Z","timestamp":1754163878510,"version":"3.41.2"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3733566.3734428","type":"proceedings-article","created":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T09:09:47Z","timestamp":1750669787000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CSD: Cross-Modal Similarity Distillation for Zero-Shot Composed Image Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1059-6549","authenticated-orcid":false,"given":"Shuping","family":"Hui","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3048-6980","authenticated-orcid":false,"given":"Min","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei Comprehensive National Science Center, Hefei, Anhui, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2353-3508","authenticated-orcid":false,"given":"Hui","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2188-3028","authenticated-orcid":false,"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"isearle: Improving textual inversion for zero-shot composed image retrieval. arXiv:2405.02951","author":"Agnolucci Lorenzo","year":"2024","unstructured":"Lorenzo Agnolucci, Alberto Baldrati, Marco Bertini, and Alberto Del Bimbo. 2024. isearle: Improving textual inversion for zero-shot composed image retrieval. arXiv:2405.02951 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In NeurIPS. 23716--23736."},{"key":"e_1_3_2_1_3_1","unstructured":"Muhammad Umer Anwaar Egor Labintcev and Martin Kleinsteuber. 2021. Compositional learning of image-text query for image retrieval. In WACV. 1140--1149."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Alberto Baldrati Lorenzo Agnolucci Marco Bertini and Alberto Del Bimbo. 2023. Zero-Shot Composed Image Retrieval with Textual Inversion. In ICCV. 15338--15347.","DOI":"10.1109\/ICCV51070.2023.01407"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Alberto Baldrati Marco Bertini Tiberio Uricchio and Alberto Del Bimbo. 2022. Effective conditioned and composed image retrieval combining CLIP-based features. In CVPR. 21466--21474.","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Yanbei Chen Shaogang Gong and Loris Bazzani. 2020. Image search with text feedback by visiolinguistic attention learning. In CVPR. 3001--3011.","DOI":"10.1109\/CVPR42600.2020.00307"},{"key":"e_1_3_2_1_7_1","unstructured":"Yiyang Chen Zhedong Zheng Wei Ji Leigang Qu and Tat-Seng Chua. 2024. Composed image retrieval with text feedback via multi-grained uncertainty regularization. In ICLR."},{"key":"e_1_3_2_1_8_1","volume-title":"Multi-Level Optimal Transport for Universal Cross-Tokenizer Knowledge Distillation on Language Models. arXiv:2412.14528","author":"Cui Xiao","year":"2024","unstructured":"Xiao Cui, Mo Zhu, Yulei Qin, Liang Xie, Wengang Zhou, and Houqiang Li. 2024. Multi-Level Optimal Transport for Universal Cross-Tokenizer Knowledge Distillation on Language Models. arXiv:2412.14528 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Gabriela Csurka, and Diane Larlus.","author":"Delmas Ginger","year":"2022","unstructured":"Ginger Delmas, Rafael Sampaio de Rezende, Gabriela Csurka, and Diane Larlus. 2022. Artemis: Attention-based retrieval with text-explicit matching and implicit similarity. In ICLR."},{"key":"e_1_3_2_1_10_1","unstructured":"Yongchao Du Min Wang Wengang Zhou Shuping Hui and Houqiang Li. 2024. Image2Sentence based Asymmetrical Zero-Shot Composed Image Retrieval. In ICLR."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Zhangchi Feng Richong Zhang and Zhijie Nie. 2024. Improving Composed Image Retrieval via Contrastive Learning with Scaling Positives and Negatives. In MM. 1632--1641.","DOI":"10.1145\/3664647.3680808"},{"key":"e_1_3_2_1_12_1","volume-title":"Fashionvlp: Vision language transformer for fashion retrieval with feedback. In CVPR. 14105--14115.","author":"Goenka Sonam","year":"2022","unstructured":"Sonam Goenka, Zhaoheng Zheng, Ayush Jaiswal, Rakesh Chada, Yue Wu, Varsha Hedau, and Pradeep Natarajan. 2022. Fashionvlp: Vision language transformer for fashion retrieval with feedback. In CVPR. 14105--14115."},{"key":"e_1_3_2_1_13_1","volume-title":"Compodiff: Versatile composed image retrieval with latent diffusion. TMLR","author":"Gu Geonmo","year":"2024","unstructured":"Geonmo Gu, Sanghyuk Chun, Wonjae Kim, HeeJae Jun, Yoohoon Kang, and Sangdoo Yun. 2024. Compodiff: Versatile composed image retrieval with latent diffusion. TMLR (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Geonmo Gu Sanghyuk Chun Wonjae Kim Yoohoon Kang and Sangdoo Yun. 2024. Language-only Efficient Training of Zero-shot Composed Image Retrieval. In CVPR. 13225--13234.","DOI":"10.1109\/CVPR52733.2024.01256"},{"key":"e_1_3_2_1_15_1","volume-title":"Fame-vil: Multi-tasking vision-language model for heterogeneous fashion tasks. In CVPR. 2669--2680.","author":"Han Xiao","year":"2023","unstructured":"Xiao Han, Xiatian Zhu, Licheng Yu, Li Zhang, Yi-Zhe Song, and Tao Xiang. 2023. Fame-vil: Multi-tasking vision-language model for heterogeneous fashion tasks. In CVPR. 2669--2680."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Young Kyun Jang Dat Huynh Ashish Shah Wen-Kai Chen and Ser-Nam Lim. 2025. Spherical Linear Interpolation and Text-Anchoring for Zero-shot Composed Image Retrieval. In ECCV. 239--254.","DOI":"10.1007\/978-3-031-72655-2_14"},{"key":"e_1_3_2_1_17_1","volume-title":"Cosmo: Content-style modulation for image retrieval with text feedback. In CVPR. 802--812.","author":"Lee Seungmin","year":"2021","unstructured":"Seungmin Lee, Dongwan Kim, and Bohyung Han. 2021. Cosmo: Content-style modulation for image retrieval with text feedback. In CVPR. 802--812."},{"key":"e_1_3_2_1_18_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In ICML. 19730--19742."},{"key":"e_1_3_2_1_19_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML. 12888--12900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML. 12888--12900."},{"key":"e_1_3_2_1_20_1","unstructured":"Junnan Li Ramprasaath Selvaraju Akhilesh Gotmare Shafiq Joty Caiming Xiong and Steven Chu Hong Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. In NeurIPS. 9694--9705."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Shenshen Li. 2023. Dual-Path Semantic Construction Network for Composed Query-Based Image Retrieval. In ICMR. 636--639.","DOI":"10.1145\/3591106.3592245"},{"key":"e_1_3_2_1_22_1","unstructured":"Zheyuan Liu Cristian Rodriguez-Opazo Damien Teney and Stephen Gould. 2021. Image Retrieval on Real-life Images with Pre-trained Vision-and-Language Models. In ICCV. 2125--2134."},{"key":"e_1_3_2_1_23_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Kuniaki Saito Kihyuk Sohn Xiang Zhang Chun-Liang Li Chen-Yu Lee Kate Saenko and Tomas Pfister. 2023. Pic2Word: Mapping Pictures to Words for Zero-shot Composed Image Retrieval. In CVPR. 19305--19314.","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"e_1_3_2_1_25_1","unstructured":"Adam Santoro David Raposo David G Barrett Mateusz Malinowski Razvan Pascanu Peter Battaglia and Timothy Lillicrap. 2017. A simple neural network module for relational reasoning. In NeurIPS."},{"key":"e_1_3_2_1_26_1","unstructured":"Yucheng Suo Fan Ma Linchao Zhu and Yi Yang. 2024. Knowledge-Enhanced Dual-stream Zero-shot Composed Image Retrieval. In CVPR. 26951--26962."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Yuanmin Tang Jing Yu Keke Gai Jiamin Zhuang Gang Xiong Yue Hu and Qi Wu. 2024. Context-I2W: Mapping Images to Context-dependent Words for Accurate Zero-Shot Composed Image Retrieval. In AAAI. 5180--5188.","DOI":"10.1609\/aaai.v38i6.28324"},{"key":"e_1_3_2_1_28_1","volume-title":"Genecis: A benchmark for general conditional image similarity. In CVPR. 6862--6872.","author":"Vaze Sagar","year":"2023","unstructured":"Sagar Vaze, Nicolas Carion, and Ishan Misra. 2023. Genecis: A benchmark for general conditional image similarity. In CVPR. 6862--6872."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Nam Vo Lu Jiang Chen Sun Kevin Murphy Li-Jia Li Li Fei-Fei and James Hays. 2019. Composing Text and Image for Image Retrieval-An Empirical Odyssey. In CVPR. 6439--6448.","DOI":"10.1109\/CVPR.2019.00660"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3346434"},{"key":"e_1_3_2_1_31_1","volume-title":"Fashion IQ: A New Dataset Towards Retrieving Images by Natural Language Feedback. In CVPR. 11307--11317.","author":"Wu Hui","year":"2021","unstructured":"Hui Wu, Yupeng Gao, Xiaoxiao Guo, Ziad Al-Halah, Steven Rennie, Kristen Grauman, and Rogerio Feris. 2021. Fashion IQ: A New Dataset Towards Retrieving Images by Natural Language Feedback. In CVPR. 11307--11317."},{"key":"e_1_3_2_1_32_1","volume-title":"Chun-Mei Feng, et al.","author":"Xu Xinxing","year":"2024","unstructured":"Xinxing Xu, Yong Liu, Salman Khan, Fahad Khan, Wangmeng Zuo, Rick Siow Mong Goh, Chun-Mei Feng, et al. 2024. Sentence-level Prompts Benefit Composed Image Retrieval. In ICLR."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Zhenyu Yang Shengsheng Qian Dizhan Xue Jiahong Wu Fan Yang Weiming Dong and Changsheng Xu. 2024. Semantic editing increment benefits zero-shot composed image retrieval. In MM. 1245--1254.","DOI":"10.1145\/3664647.3681649"},{"key":"e_1_3_2_1_34_1","volume-title":"CoCa: Contrastive Captioners are Image-Text Foundation Models. TMLR","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. TMLR (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Huaying Zhang Rintaro Yanagi Ren Togo Takahiro Ogawa and Miki Haseyama. 2024. Zero-Shot composed image retrieval considering query-target relationship leveraging masked image-text pairs. In ICIP. 2431--2437.","DOI":"10.1109\/ICIP51287.2024.10648023"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 6th Workshop on Intelligent Cross-Data Analysis and Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3733566.3734428","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,31]],"date-time":"2025-07-31T16:19:43Z","timestamp":1753978783000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3733566.3734428"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":35,"alternative-id":["10.1145\/3733566.3734428","10.1145\/3733566"],"URL":"https:\/\/doi.org\/10.1145\/3733566.3734428","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}