{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T17:39:50Z","timestamp":1775324390972,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":28,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533923","type":"print"},{"value":"9789819533930","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3393-0_7","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T09:50:59Z","timestamp":1761904259000},"page":"80-91","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-view Captioning with Semantic Delta Re-ranking for Zero-Shot Composed Video Retrieval"],"prefix":"10.1007","author":[{"given":"Zhixiang","family":"Ding","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lilong","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenyu","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shengsheng","family":"Qian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"7_CR1","first-page":"5270","volume":"38","author":"L Ventura","year":"2024","unstructured":"Ventura, L., Yang, A., Schmid, C., Varol, G.: Covr: learning composed video retrieval from web video captions. Proc. AAAI Conf. Artif. Intell. 38, 5270\u20135279 (2024)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Hummel, T., Karthik, S., Georgescu, M.I., Akata, Z.: EgoCVR: an egocentric benchmark for fine-grained composed video retrieval. In: European Conference on Computer Vision, pp. 1\u201317. Springer (2024)","DOI":"10.1007\/978-3-031-72913-3_1"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Thawakar, O., et al.: Composed video retrieval via enriched context and discriminative embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26896\u201326906 (2024)","DOI":"10.1109\/CVPR52733.2024.02540"},{"key":"7_CR4","unstructured":"Yue, W., Qi, Z., Wu, Y., Sun, J., Wang, Y., Wang, S.: Learning fine-grained representations through textual token disentanglement in composed video retrieval. In: The Thirteenth International Conference on Learning Representations (2025)"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Song, X., Lin, H., Wen, H., Hou, B., Xu, M., Nie, L.: A comprehensive survey on composed image retrieval. arXiv preprint arXiv:2502.18495 (2025)","DOI":"10.1145\/3767328"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Zhenyu Yang, et al.: Semantic editing increment benefits zero-shot composed image retrieval. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 1245\u20131254 (2024)","DOI":"10.1145\/3664647.3681649"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: FaceNet: a unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 815\u2013823 (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Vo, N., et al.: Composing text and image for image retrieval-an empirical odyssey. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6439\u20136448 (2019)","DOI":"10.1109\/CVPR.2019.00660"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Baldrati, A., Bertini, M., Uricchio, T., Bimbo, A,D.: Effective conditioned and composed image retrieval combining clip-based features. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21466\u201321474 (2022)","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"7_CR10","unstructured":"Delmas, G., de Rezende, R.S., Csurka, G., Larlus, D.: Artemis: attention-based retrieval with text-explicit matching and implicit similarity. arXiv preprint arXiv:2203.08101 (2022)"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Saito, K., et al.: Pic2Word: mapping pictures to words for zero-shot composed image retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19305\u201319314 (2023)","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"7_CR12","unstructured":"Yang, Z., et al.:. SVBench: a benchmark with temporal multi-turn dialogues for streaming video understanding. arXiv preprint arXiv:2502.10810 (2025)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Karthik, S., Mancini, M., Akata, Z.: KG-SP: knowledge guided simple primitives for open world compositional zero-shot learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9336\u20139345 (2022)","DOI":"10.1109\/CVPR52688.2022.00912"},{"key":"7_CR14","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Yang, Z., Xue, D., Qian, S., Dong, W., Xu, C.: LDRE: Llm-based divergent reasoning and ensemble for zero-shot composed image retrieval. In: Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 80\u201390 (2024)","DOI":"10.1145\/3626772.3657740"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Y., Xiong, P., Xu, L., Cao, S., Jin, Q.: TS2-Net: token shift and selection transformer for text-video retrieval. In: European Conference on Computer Vision, pp. 319\u2013335. Springer (2022)","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"7_CR17","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: CLIP4CLIP: an empirical study of CLIP for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"issue":"1","key":"7_CR18","first-page":"1","volume":"3","author":"X Yingjia","year":"2025","unstructured":"Yingjia, X., Mengxia, W., Guo, Z., Cao, M., Ye, M., Laaksonen, J.: Efficient text-to-video retrieval via multi-modal multi-tagger derived pre-screening. Vis. Intell. 3(1), 1\u201313 (2025)","journal-title":"Vis. Intell."},{"key":"7_CR19","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Rasheed, H., Khattak, M.U., Maaz, M., Khan, S., Khan, F.S.: Fine-tuned CLIP models are efficient video learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6545\u20136554 (2023)","DOI":"10.1109\/CVPR52729.2023.00633"},{"key":"7_CR21","unstructured":"Xue, H., et al.: CLIP-VIP: adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430 (2022)"},{"key":"7_CR22","unstructured":"Yao, L., et al.:. FILIP: fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)"},{"key":"7_CR23","unstructured":"Hurst, A., et\u00a0al.: GPT-4o system card. arXiv preprint arXiv:2410.21276 (2024)"},{"key":"7_CR24","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"7_CR26","unstructured":"Grauman, K., et\u00a0al.: EGO4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"7_CR27","unstructured":"Bin Zhu, et\u00a0al.: LanguageBind: extending video-language pretraining to N-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)"},{"key":"7_CR28","unstructured":"Karthik, S., Roth, K., Mancini, M., Akata, Z.: Vision-by-language for training-free compositional image retrieval. arXiv preprint arXiv:2310.09291 (2023)"}],"container-title":["Lecture Notes in Computer Science","Image and Graphics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3393-0_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T16:29:31Z","timestamp":1775320171000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3393-0_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9789819533923","9789819533930"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3393-0_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIG","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Image and Graphics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Xuzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icig2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icig.csig.org.cn\/2025\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}