{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:00:04Z","timestamp":1775815204138,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T00:00:00Z","timestamp":1720569600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,10]]},"DOI":"10.1145\/3626772.3657823","type":"proceedings-article","created":{"date-parts":[[2024,7,11]],"date-time":"2024-07-11T12:40:05Z","timestamp":1720701605000},"page":"2177-2187","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["CaLa: Complementary Association Learning for Augmenting Composed Image Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8664-460X","authenticated-orcid":false,"given":"Xintong","family":"Jiang","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6596-8117","authenticated-orcid":false,"given":"Yaxiong","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9698-8508","authenticated-orcid":false,"given":"Mengjian","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6366-9834","authenticated-orcid":false,"given":"Yujiao","family":"Wu","sequence":"additional","affiliation":[{"name":"CSIRO, Hobart, 
Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4890-1775","authenticated-orcid":false,"given":"Bingwen","family":"Hu","sequence":"additional","affiliation":[{"name":"Anhui University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3173-6307","authenticated-orcid":false,"given":"Xueming","family":"Qian","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2024,7,11]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Rick Siow Mong Goh, and Chun-Mei Feng","author":"Bai Yang","year":"2023","unstructured":"Yang Bai, Xinxing Xu, Yong Liu, Salman Khan, Fahad Khan, Wangmeng Zuo, Rick Siow Mong Goh, and Chun-Mei Feng. 2023. Sentence-level Prompts Benefit Composed Image Retrieval. arXiv preprint arXiv:2310.05473 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Alberto Baldrati Marco Bertini Tiberio Uricchio and Alberto Del Bimbo. 2021. Conditioned image retrieval for fashion using contrastive learning and CLIPbased features. In ACM Multimedia Asia. 1--5.","DOI":"10.1145\/3469877.3493593"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00543"},{"key":"e_1_3_2_1_4_1","volume-title":"SPIRIT: Style-guided Patch Interaction for Fashion Image Retrieval with Text Feedback. ACM Transactions on Multimedia Computing, Communications and Applications","author":"Chen Yanzhe","year":"2024","unstructured":"Yanzhe Chen, Jiahuan Zhou, and Yuxin Peng. 2024. SPIRIT: Style-guided Patch Interaction for Fashion Image Retrieval with Text Feedback. ACM Transactions on Multimedia Computing, Communications and Applications (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity. In International Conference on Learning Representations. 
https:\/\/ openreview.net\/forum?id=CVfLvQq9gLo","author":"Delmas Ginger","year":"2022","unstructured":"Ginger Delmas, Rafael S. Rezende, Gabriela Csurka, and Diane Larlus. 2022. ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity. In International Conference on Learning Representations. https:\/\/ openreview.net\/forum?id=CVfLvQq9gLo"},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_8_1","volume-title":"Dialog-based interactive image retrieval. Advances in neural information processing systems 31","author":"Guo Xiaoxiao","year":"2018","unstructured":"Xiaoxiao Guo, Hui Wu, Yu Cheng, Steven Rennie, Gerald Tesauro, and Rogerio Feris. 2018. Dialog-based interactive image retrieval. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"Logical Entity Representation in Knowledge-Graphs for Differentiable Rule Learning. 
ICLR","author":"Han Chi","year":"2023","unstructured":"Chi Han, Qizheng He, Charles Yu, Xinya Du, Hanghang Tong, and Heng Ji. 2023. Logical Entity Representation in Knowledge-Graphs for Differentiable Rule Learning. ICLR (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.163"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00067"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16271"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00086"},{"key":"e_1_3_2_1_14_1","volume-title":"Data Roaming and Early Fusion for Composed Image Retrieval. arXiv preprint arXiv:2303.09429","author":"Levy Matan","year":"2023","unstructured":"Matan Levy, Rami Ben-Ari, Nir Darshan, and Dani Lischinski. 2023. Data Roaming and Early Fusion for Composed Image Retrieval. arXiv preprint arXiv:2303.09429 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730--19742. https: \/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_2_1_16_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning, ICML 2022","volume":"12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022. 
BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning, ICML 2022, 17-23 July 2022, Baltimore, Maryland, USA (Proceedings of Machine Learning Research, Vol. 162), Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesv\u00e1ri, Gang Niu, and Sivan Sabato (Eds.). PMLR, 12888--12900."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00565"},{"key":"e_1_3_2_1_20_1","volume-title":"Candidate Set Re-ranking for Composed Image Retrieval with Dual Multi-modal Encoder. arXiv preprint arXiv:2305.16304","author":"Liu Zheyuan","year":"2023","unstructured":"Zheyuan Liu, Weixuan Sun, Damien Teney, and Stephen Gould. 2023. Candidate Set Re-ranking for Composed Image Retrieval with Dual Multi-modal Encoder. arXiv preprint arXiv:2305.16304 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591712"},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 
2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763."},{"key":"e_1_3_2_1_24_1","volume-title":"Joint language semantic and structure embedding for knowledge graph completion. COLING","author":"Shen Jianhao","year":"2022","unstructured":"Jianhao Shen, Chenguang Wang, Linyuan Gong, and Dawn Song. 2022. Joint language semantic and structure embedding for knowledge graph completion. COLING (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"Rtic: Residual learning for text and image composition using graph convolutional network. arXiv preprint arXiv:2104.03015","author":"Shin Minchul","year":"2021","unstructured":"Minchul Shin, Yoonjae Cho, Byungsoo Ko, and Geonmo Gu. 2021. Rtic: Residual learning for text and image composition using graph convolutional network. arXiv preprint arXiv:2104.03015 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00562"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1644"},{"key":"e_1_3_2_1_28_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_29_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. 
arXiv:1706.03762 [cs.CL]"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3024822"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/526"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611817"},{"key":"e_1_3_2_1_35_1","volume-title":"The Fashion IQ Dataset: Retrieving Images by Combining Side Information and Relative Natural Language Feedback. CVPR","author":"Wu Hui","year":"2021","unstructured":"Hui Wu, Yupeng Gao, Xiaoxiao Guo, Ziad Al-Halah, Steven Rennie, Kristen Grauman, and Rogerio Feris. 2021. The Fashion IQ Dataset: Retrieving Images by Combining Side Information and Relative Natural Language Feedback. CVPR (2021)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611709"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.STARSEM-1.20"},{"key":"e_1_3_2_1_38_1","volume-title":"CoCa: Contrastive Captioners are Image-Text Foundation Models. Trans. Mach. Learn. Res. 2022","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. Trans. Mach. Learn. Res. 2022 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"Relieving Triplet Ambiguity: Consensus Network for Language-Guided Image Retrieval. arXiv preprint arXiv:2306.02092","author":"Zhang Xu","year":"2023","unstructured":"Xu Zhang, Zhedong Zheng, Xiaohan Wang, and Yi Yang. 2023. Relieving Triplet Ambiguity: Consensus Network for Language-Guided Image Retrieval. 
arXiv preprint arXiv:2306.02092 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.KNOSYS.2023.110280"}],"event":{"name":"SIGIR 2024: The 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Washington DC USA","acronym":"SIGIR 2024","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626772.3657823","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3626772.3657823","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:39:03Z","timestamp":1755841143000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626772.3657823"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,10]]},"references-count":40,"alternative-id":["10.1145\/3626772.3657823","10.1145\/3626772"],"URL":"https:\/\/doi.org\/10.1145\/3626772.3657823","relation":{},"subject":[],"published":{"date-parts":[[2024,7,10]]},"assertion":[{"value":"2024-07-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}