{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:01:07Z","timestamp":1775815267434,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key R&D Program of China","award":["2022YFB4701400\/4701402"],"award-info":[{"award-number":["2022YFB4701400\/4701402"]}]},{"name":"Beijing Key Lab of Networked Multimedia"},{"name":"SSTIC Grant","award":["KJZD20230923115106012, KJZD20230923114916032"],"award-info":[{"award-number":["KJZD20230923115106012, KJZD20230923114916032"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681493","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"5575-5583","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Semantic Distillation from Neighborhood for Composed Image Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6719-5063","authenticated-orcid":false,"given":"Yifan","family":"Wang","sequence":"first","affiliation":[{"name":"Tsinghua Shenzhen International School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6378-2498","authenticated-orcid":false,"given":"Wuliang","family":"Huang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4514-3617","authenticated-orcid":false,"given":"Lei","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3590-6676","authenticated-orcid":false,"given":"Chun","family":"Yuan","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International School, Tsinghua University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Compositional Learning of Image-Text Query for Image Retrieval","author":"Anwaar Muhammad Umer","unstructured":"Muhammad Umer Anwaar, Egor Labintcev, and Martin Kleinsteuber. 2021. Compositional Learning of Image-Text Query for Image Retrieval. In WACV. IEEE, 1139--1148."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Yang Bai Jingyao Wang Min Cao Chen Chen Ziqiang Cao Liqiang Nie and Min Zhang. 2023. Text-based Person Search without Parallel Image-Text Data. In ACM MM. ACM.","DOI":"10.1145\/3581783.3612285"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00543"},{"key":"e_1_3_2_1_4_1","volume-title":"Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection","author":"Cao Yichao","unstructured":"Yichao Cao, Qingfei Tang, Feng Yang, Xiu Su, Shan You, Xiaobo Lu, and Chang Xu. 2023. Re-mine, Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection. In ICCV. IEEE, 23435--23446."},{"key":"e_1_3_2_1_5_1","volume-title":"Leveraging Style and Content Features for Text Conditioned Image Retrieval. In CVPR Workshops. IEEE, 3978--3982","author":"Chawla Pranit","year":"2021","unstructured":"Pranit Chawla, Surgan Jandial, Pinkesh Badjatiya, Ayush Chopra, Mausoom Sarkar, and Balaji Krishnamurthy. 2021. Leveraging Style and Content Features for Text Conditioned Image Retrieval. In CVPR Workshops. IEEE, 3978--3982."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3185668"},{"key":"e_1_3_2_1_7_1","volume-title":"IMRAM: Iterative Matching With Recurrent Attention Memory for Cross-Modal Image-Text Retrieval","author":"Chen Hui","year":"2020","unstructured":"Hui Chen, Guiguang Ding, Xudong Liu, Zijia Lin, Ji Liu, and Jungong Han. 2020. IMRAM: Iterative Matching With Recurrent Attention Memory for Cross-Modal Image-Text Retrieval. In CVPR. IEEE, 12652--12660."},{"key":"e_1_3_2_1_8_1","volume-title":"Ranking-aware Uncertainty for Text-guided Image Retrieval. CoRR","author":"Chen Junyang","year":"2023","unstructured":"Junyang Chen and Hanjiang Lai. 2023. Ranking-aware Uncertainty for Text-guided Image Retrieval. CoRR, Vol. abs\/2308.08131 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Image Search With Text Feedback by Visiolinguistic Attention Learning","author":"Chen Yanbei","unstructured":"Yanbei Chen, Shaogang Gong, and Loris Bazzani. 2020. Image Search With Text Feedback by Visiolinguistic Attention Learning. In CVPR. IEEE, 2998--3008."},{"key":"e_1_3_2_1_10_1","volume-title":"Composed Image Retrieval with Text Feedback via Multi-grained Uncertainty Regularization. CoRR","author":"Chen Yiyang","year":"2022","unstructured":"Yiyang Chen, Zhedong Zheng, Wei Ji, Leigang Qu, and Tat-Seng Chua. 2022. Composed Image Retrieval with Text Feedback via Multi-grained Uncertainty Regularization. CoRR, Vol. abs\/2211.07394 (2022)."},{"key":"e_1_3_2_1_11_1","unstructured":"Yiyang Chen Zhedong Zheng Wei Ji Leigang Qu and Tat-Seng Chua. 2024. Composed Image Retrieval with Text Feedback via Multi-grained Uncertainty Regularization. In ICLR. OpenReview.net."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640345"},{"key":"e_1_3_2_1_13_1","volume-title":"Gabriela Csurka, and Diane Larlus.","author":"Delmas Ginger","year":"2022","unstructured":"Ginger Delmas, Rafael Sampaio de Rezende, Gabriela Csurka, and Diane Larlus. 2022. ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity. In ICLR. OpenReview.net."},{"key":"e_1_3_2_1_14_1","volume-title":"Modality-Agnostic Attention Fusion for visual search with text feedback. CoRR","author":"Dodds Eric","year":"2020","unstructured":"Eric Dodds, Jack Culpepper, Simao Herdade, Yang Zhang, and Kofi Boakye. 2020. Modality-Agnostic Attention Fusion for visual search with text feedback. CoRR, Vol. abs\/2007.00145 (2020)."},{"key":"e_1_3_2_1_15_1","volume-title":"FashionVLP: Vision Language Transformer for Fashion Retrieval with Feedback","author":"Goenka Sonam","unstructured":"Sonam Goenka, Zhaoheng Zheng, Ayush Jaiswal, Rakesh Chada, Yue Wu, Varsha Hedau, and Pradeep Natarajan. 2022. FashionVLP: Vision Language Transformer for Fashion Retrieval with Feedback. In CVPR. IEEE, 14085--14095."},{"key":"e_1_3_2_1_16_1","unstructured":"Chunbin Gu Jiajun Bu Zhen Zhang Zhi Yu Dongfang Ma and Wei Wang. 2021. Image Search with Text Feedback by Deep Hierarchical Attention Mutual Information Maximization. In ACM MM. ACM 4600--4609."},{"key":"e_1_3_2_1_17_1","volume-title":"Exploiting the Social-Like Prior in Transformer for Visual Reasoning","author":"Han Yudong","year":"2058","unstructured":"Yudong Han, Yupeng Hu, Xuemeng Song, Haoyu Tang, Mingzhu Xu, and Liqiang Nie. 2024. Exploiting the Social-Like Prior in Transformer for Visual Reasoning. In AAAI. AAAI Press, 2058--2066."},{"key":"e_1_3_2_1_18_1","volume-title":"Girshick","author":"He Kaiming","year":"2020","unstructured":"Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross B. Girshick. 2020. Momentum Contrast for Unsupervised Visual Representation Learning. In CVPR. IEEE, 9726--9735."},{"key":"e_1_3_2_1_19_1","volume-title":"Deep Residual Learning for Image Recognition","author":"He Kaiming","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. IEEE, 770--778."},{"key":"e_1_3_2_1_20_1","volume-title":"Dynamic Weighted Combiner for Mixed-Modal Image Retrieval","author":"Huang Fuxiang","unstructured":"Fuxiang Huang, Lei Zhang, Xiaowei Fu, and Suqi Song. 2024. Dynamic Weighted Combiner for Mixed-Modal Image Retrieval. In AAAI. AAAI Press, 2303--2311."},{"key":"e_1_3_2_1_21_1","volume-title":"Maintaining Reasoning Consistency in Compositional Visual Question Answering","author":"Jing Chenchen","unstructured":"Chenchen Jing, Yunde Jia, Yuwei Wu, Xinyu Liu, and Qi Wu. 2022. Maintaining Reasoning Consistency in Compositional Visual Question Answering. In CVPR. IEEE, 5089--5098."},{"key":"e_1_3_2_1_22_1","volume-title":"Dual Compositional Learning in Interactive Image Retrieval","author":"Kim Jongseok","unstructured":"Jongseok Kim, Youngjae Yu, Hoeseong Kim, and Gunhee Kim. 2021. Dual Compositional Learning in Interactive Image Retrieval. In AAAI. AAAI Press, 1771--1779."},{"key":"e_1_3_2_1_23_1","volume-title":"Beyond a Pre-Trained Object Detector: Cross-Modal Textual and Visual Context for Image Captioning","author":"Kuo Chia-Wen","unstructured":"Chia-Wen Kuo and Zsolt Kira. 2022. Beyond a Pre-Trained Object Detector: Cross-Modal Textual and Visual Context for Image Captioning. In CVPR. IEEE, 17948--17958."},{"key":"e_1_3_2_1_24_1","volume-title":"CoSMo: Content-Style Modulation for Image Retrieval With Text Feedback","author":"Lee Seungmin","unstructured":"Seungmin Lee, Dongwan Kim, and Bohyung Han. 2021. CoSMo: Content-Style Modulation for Image Retrieval With Text Feedback. In CVPR. IEEE, 802--812."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19821-2_28"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Haoxuan Li Yi Bin Junrong Liao Yang Yang and Heng Tao Shen. 2023. Your Negative May not Be True Negative: Boosting Image-Text Matching with False Negative Elimination. In ACM MM. ACM 924--934.","DOI":"10.1145\/3581783.3612101"},{"key":"e_1_3_2_1_27_1","volume-title":"Multi-Grained Attention Network with Mutual Exclusion for Composed Query-Based Image Retrieval","author":"Li Shenshen","year":"2023","unstructured":"Shenshen Li, Xing Xu, Xun Jiang, Fumin Shen, Xin Liu, and Heng Tao Shen. 2023. Multi-Grained Attention Network with Mutual Exclusion for Composed Query-Based Image Retrieval. IEEE Transactions on Circuits and Systems for Video Technology (2023), 1--1."},{"key":"e_1_3_2_1_28_1","volume-title":"Hypergraph-Induced Semantic Tuplet Loss for Deep Metric Learning","author":"Lim Jongin","unstructured":"Jongin Lim, Sangdoo Yun, Seulki Park, and Jin Young Choi. 2022. Hypergraph-Induced Semantic Tuplet Loss for Deep Metric Learning. In CVPR. IEEE, 212--222."},{"key":"e_1_3_2_1_29_1","volume-title":"Damien Teney, and Stephen Gould.","author":"Liu Zheyuan","year":"2021","unstructured":"Zheyuan Liu, Cristian Rodriguez Opazo, Damien Teney, and Stephen Gould. 2021. Image Retrieval on Real-life Images with Pre-trained Vision-and-Language Models. In ICCV. IEEE, 2105--2114."},{"key":"e_1_3_2_1_30_1","volume-title":"Bi-directional Training for Composed Image Retrieval via Text Prompt Learning","author":"Liu Zheyuan","unstructured":"Zheyuan Liu, Weixuan Sun, Yicong Hong, Damien Teney, and Stephen Gould. 2024. Bi-directional Training for Composed Image Retrieval via Text Prompt Learning. In WACV. IEEE, 5741--5750."},{"key":"e_1_3_2_1_31_1","volume-title":"A fast and simple algorithm for training neural probabilistic language models","author":"Mnih Andriy","unstructured":"Andriy Mnih and Yee Whye Teh. 2012. A fast and simple algorithm for training neural probabilistic language models. In ICML. Omnipress, 419--426."},{"key":"e_1_3_2_1_32_1","volume-title":"ICML","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML, Vol. 139. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","volume-title":"FaceNet: A unified embedding for face recognition and clustering","author":"Schroff Florian","unstructured":"Florian Schroff, Dmitry Kalenichenko, and James Philbin. 2015. FaceNet: A unified embedding for face recognition and clustering. In CVPR. IEEE, 815--823."},{"key":"e_1_3_2_1_34_1","volume-title":"Grad-CAM: Visual Explanations from Deep Networks via Gradient-Based Localization","author":"Selvaraju Ramprasaath R.","unstructured":"Ramprasaath R. Selvaraju, Michael Cogswell, Abhishek Das, Ramakrishna Vedantam, Devi Parikh, and Dhruv Batra. 2017. Grad-CAM: Visual Explanations from Deep Networks via Gradient-Based Localization. In ICCV. IEEE, 618--626."},{"key":"e_1_3_2_1_35_1","volume-title":"A Corpus for Reasoning about Natural Language Grounded in Photographs","author":"Suhr Alane","unstructured":"Alane Suhr, Stephanie Zhou, Ally Zhang, Iris Zhang, Huajun Bai, and Yoav Artzi. 2019. A Corpus for Reasoning about Natural Language Grounded in Photographs. In ACL. Association for Computational Linguistics, 6418--6428."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3268004"},{"key":"e_1_3_2_1_37_1","volume-title":"Semantic Relation-aware Difference Representation Learning for Change Captioning","author":"Tu Yunbin","unstructured":"Yunbin Tu, Tingting Yao, Liang Li, Jiedong Lou, Shengxiang Gao, Zhengtao Yu, and Chenggang Yan. 2021. Semantic Relation-aware Difference Representation Learning for Change Captioning. In ACL\/IJCNLP. Association for Computational Linguistics, 63--73."},{"key":"e_1_3_2_1_38_1","volume-title":"Composing Text and Image for Image Retrieval - an Empirical Odyssey","author":"Vo Nam","unstructured":"Nam Vo, Lu Jiang, Chen Sun, Kevin Murphy, Li-Jia Li, Li Fei-Fei, and James Hays. 2019. Composing Text and Image for Image Retrieval - an Empirical Odyssey. In CVPR. IEEE, 6439--6448."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3369898"},{"key":"e_1_3_2_1_40_1","unstructured":"Haokun Wen Xuemeng Song Xin Yang Yibing Zhan and Liqiang Nie. 2021. Comprehensive Linguistic-Visual Composition Network for Image Retrieval. In SIGIR. ACM 1369--1378."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Haokun Wen Xian Zhang Xuemeng Song Yinwei Wei and Liqiang Nie. 2023. Target-Guided Composed Image Retrieval. In ACM MM. ACM 915--923.","DOI":"10.1145\/3581783.3611817"},{"key":"e_1_3_2_1_42_1","volume-title":"Fashion IQ: A New Dataset Towards Retrieving Images by Natural Language Feedback","author":"Wu Hui","year":"2021","unstructured":"Hui Wu, Yupeng Gao, Xiaoxiao Guo, Ziad Al-Halah, Steven Rennie, Kristen Grauman, and Rog\u00e9rio Feris. 2021. Fashion IQ: A New Dataset Towards Retrieving Images by Natural Language Feedback. In CVPR. IEEE, 11307--11317."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102065"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3299791"},{"key":"e_1_3_2_1_45_1","volume-title":"Foreground Object Search by Distilling Composite Image Feature","author":"Zhang Bo","unstructured":"Bo Zhang, Jiacheng Sui, and Li Niu. 2023. Foreground Object Search by Distilling Composite Image Feature. In ICCV. IEEE, 22929--22938."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Feifei Zhang Ming Yan Ji Zhang and Changsheng Xu. 2022. Comprehensive Relationship Reasoning for Composed Query Based Image Retrieval. In ACM MM. ACM 4655--4664.","DOI":"10.1145\/3503161.3548126"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3359062"},{"key":"e_1_3_2_1_48_1","volume-title":"Relieving Triplet Ambiguity: Consensus Network for Language-Guided Image Retrieval. CoRR","author":"Zhang Xu","year":"2092","unstructured":"Xu Zhang, Zhedong Zheng, Xiaohan Wang, and Yi Yang. 2023. Relieving Triplet Ambiguity: Consensus Network for Language-Guided Image Retrieval. CoRR, Vol. abs\/2306.02092 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Minyi Zhao Jinpeng Wang Dongliang Liao Yiru Wang Huanzhong Duan and Shuigeng Zhou. 2023. Keyword-Based Diverse Image Retrieval by Semantics-aware Contrastive Learning and Transformer. In ACM SIGIR. ACM 1262--1272.","DOI":"10.1145\/3539618.3591705"},{"key":"e_1_3_2_1_50_1","volume-title":"Patel","author":"Zhou Mo","year":"2022","unstructured":"Mo Zhou and Vishal M. Patel. 2022. Enhancing Adversarial Robustness for Deep Metric Learning. In CVPR. IEEE, 15304--15313."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681493","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681493","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681493"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":50,"alternative-id":["10.1145\/3664647.3681493","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681493","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}