{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T21:12:17Z","timestamp":1770066737837,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":24,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819645886","type":"print"},{"value":"9789819645893","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-4589-3_6","type":"book-chapter","created":{"date-parts":[[2025,4,12]],"date-time":"2025-04-12T21:29:54Z","timestamp":1744493394000},"page":"77-90","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Training-Free Zero-Shot Composed Image Retrieval via\u00a0Weighted Modality Fusion and\u00a0Similarity"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4564-8790","authenticated-orcid":false,"given":"Ren-Di","family":"Wu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1515-0366","authenticated-orcid":false,"given":"Yu-Yen","family":"Lin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8261-6965","authenticated-orcid":false,"given":"Huei-Fang","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,13]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Baldrati, A., Agnolucci, L., Bertini, M., Del\u00a0Bimbo, A.: Zero-shot composed image retrieval with textual inversion. In: ICCV, pp. 15338\u201315347 (2023)","DOI":"10.1109\/ICCV51070.2023.01407"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Baldrati, A., Bertini, M., Uricchio, T., Del\u00a0Bimbo, A.: Effective conditioned and composed image retrieval combining clip-based features. In: CVPR, pp. 21466\u201321474 (2022)","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"6_CR4","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS, pp. 1877\u20131901 (2020)"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Cimino, M.G.C.A., Galatolo, F.A., Vaglini, G.: Generating images from caption and vice versa via clip-guided generative latent space search. In: IMPROVE, pp. 166\u2013174 (2021)","DOI":"10.5220\/0010503701660174"},{"key":"6_CR6","unstructured":"Delmas, G., Rezende, R.S., Csurka, G., Larlus, D.: Artemis: attention-based retrieval with text-explicit matching and implicit similarity. In: ICLR (2022)"},{"key":"6_CR7","unstructured":"Gemini Team:\u00a0Anil, R., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"6_CR8","unstructured":"Gu, G., Chun, S., Kim, W., Jun, H., Kang, Y., Yun, S.: CompoDiff: versatile composed image retrieval with latent diffusion. TMLR (2024)"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Gu, G., Chun, S., Kim, W., Kang, Y., Yun, S.: Language-only training of zero-shot composed image retrieval. In: CVPR, pp. 13225\u201313234 (2024)","DOI":"10.1109\/CVPR52733.2024.01256"},{"key":"6_CR10","unstructured":"Karthik, S., Roth, K., Mancini, M., Akata, Z.: Vision-by-language for training-free compositional image retrieval. In: ICLR (2024)"},{"key":"6_CR11","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML, pp. 12888\u201312900 (2022)"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Li, M., et al..: Clip-event: connecting text and images with event structures. In: CVPR, pp. 16420\u201316429 (2022)","DOI":"10.1109\/CVPR52688.2022.01593"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"Liu, Z., Rodriguez-Opazo, C., Teney, D., Gould, S.: Image retrieval on real-life images with pre-trained vision-and-language models. In: ICCV, pp. 2125\u20132134 (2021)","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Liu, Z., Sun, W., Hong, Y., Teney, D., Gould, S.: Bi-directional training for composed image retrieval via text prompt learning. In: WACV, pp. 5753\u20135762 (2024)","DOI":"10.1109\/WACV57701.2024.00565"},{"key":"6_CR15","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"6_CR16","unstructured":"Radford, A., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI Blog, p.\u00a09 (2019)"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Saito, K., et al.: Pic2Word: mapping pictures to words for zero-shot composed image retrieval. In: CVPR, pp. 19305\u201319314 (2023)","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"6_CR18","unstructured":"Schuhmann, C., et\u00a0al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: NeurIPS, pp. 25278\u201325294 (2022)"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Suhr, A., Zhou, S., Zhang, A., Zhang, I., Bai, H., Artzi, Y.: A corpus for reasoning about natural language grounded in photographs. In: NeurIPS, pp. 6418\u20136428 (2019)","DOI":"10.18653\/v1\/P19-1644"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: lessons learned from the 2015 mscoco image captioning challenge. IEEE TPAMI, pp. 652\u2013663 (2016)","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Vo, N., et al.: Composing text and image for image retrieval - an empirical odyssey. In: CVPR, pp. 6439\u20136448 (2019)","DOI":"10.1109\/CVPR.2019.00660"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Fashion IQ: a new dataset towards retrieving images by natural language feedback. In: CVPR, pp. 11307\u201311317 (2021)","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Yang, Z., Xue, D., Qian, S., Dong, W., Xu, C.: LDRE: LLM-based divergent reasoning and ensemble for zero-shot composed image retrieval. In: SIGIR, pp. 80\u201390 (2024)","DOI":"10.1145\/3626772.3657740"},{"key":"6_CR24","unstructured":"Zhang, K., et al.: Magiclens: self-supervised image retrieval with open-ended instructions. arXiv preprint arXiv:2403.19651 (2024)"}],"container-title":["Communications in Computer and Information Science","Technologies and Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-4589-3_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,12]],"date-time":"2025-04-12T21:30:11Z","timestamp":1744493411000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-4589-3_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819645886","9789819645893"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-4589-3_6","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"13 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TAAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Technologies and Applications of Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hsinchu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiwan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"taai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}