{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T04:41:06Z","timestamp":1770698466228,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819569564","type":"print"},{"value":"9789819569571","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-6957-1_42","type":"book-chapter","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T10:45:45Z","timestamp":1770633945000},"page":"588-601","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hierarchical Cross-Modality Interaction for\u00a0Unified Video-Text Retrieval Modeling"],"prefix":"10.1007","author":[{"given":"Tianshi","family":"Xu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengzheng","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yizheng","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junyuan","family":"Shang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Si","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,10]]},"reference":[{"key":"42_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. arXiv preprint arXiv:2204.14198 (2022)"},{"key":"42_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"42_CR3","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"issue":"7","key":"42_CR4","doi-asserted-by":"publisher","first-page":"6559","DOI":"10.1109\/TCSVT.2024.3360530","volume":"34","author":"L Chen","year":"2024","unstructured":"Chen, L., Deng, Z., Liu, L., Yin, S.: Multilevel semantic interaction alignment for video-text cross-modal retrieval. IEEE Trans. Circuits Syst. Video Technol. 34(7), 6559\u20136575 (2024)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"42_CR5","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhao, Y., Jin, Q., Wu, Q.: Fine-grained video-text retrieval with hierarchical graph reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10638\u201310647 (2020)","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"42_CR6","unstructured":"Fang, H., Xiong, P., Xu, L., Chen, Y.: Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)"},{"key":"42_CR7","doi-asserted-by":"crossref","unstructured":"Gorti, S.K., Vouitsis, N., Ma, J., Golestan, K., Volkovs, M., Garg, A., Yu, G.: X-pool: cross-modal language-video attention for text-video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5006\u20135015 (2022)","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"42_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.111144","volume":"159","author":"H Jiang","year":"2025","unstructured":"Jiang, H., Zhang, J., Huang, R., Ge, C., Ni, Z., Song, S., Huang, G.: Cross-modal adapter for vision-language retrieval. Patt. Recogn. 159, 111144 (2025)","journal-title":"Patt. Recogn."},{"key":"42_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2025.103151","volume":"121","author":"X Jing","year":"2025","unstructured":"Jing, X., Yang, G., Chu, J.: Tc-MGC: text-conditioned multi-grained contrastive learning for text-video retrieval. Inf. Fusion 121, 103151 (2025)","journal-title":"Inf. Fusion"},{"key":"42_CR10","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"42_CR11","unstructured":"Lei, J., Chen, X., Zhang, N., Wang, M., Bansal, M., Berg, T.L., Yu, L.: Loopitr: combining dual and cross encoder architectures for image-text retrieval. arXiv preprint arXiv:2203.05465 (2022)"},{"key":"42_CR12","doi-asserted-by":"crossref","unstructured":"Lei, J., Li, L., Zhou, L., Gan, Z., Berg, T.L., Bansal, M., Liu, J.: Less is more: clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"42_CR13","unstructured":"Li, W., Wang, S., Zhao, D., Xu, S., Pan, Z., Zhang, Z.: Multi-granularity and multi-modal feature interaction approach for text video retrieval. arXiv preprint arXiv:2407.12798 (2024)"},{"key":"42_CR14","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., Wang, Z.: Hit: hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11915\u201311925 (2021)","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"42_CR15","doi-asserted-by":"crossref","unstructured":"Liu, Y., Xiong, P., Xu, L., Cao, S., Jin, Q.: Ts2-net: Token shift and selection transformer for text-video retrieval. In: European Conference on Computer Vision, pp. 319\u2013335. Springer (2022)","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"42_CR16","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: Clip4clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"42_CR17","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., Ji, R.: X-clip: end-to-end multi-grained contrastive learning for video-text retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 638\u2013647 (2022)","DOI":"10.1145\/3503161.3547910"},{"key":"42_CR18","doi-asserted-by":"crossref","unstructured":"Park, Y., et al.: Normalized contrastive learning for text-video retrieval. arXiv preprint arXiv:2212.11790 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.17"},{"key":"42_CR19","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"42_CR20","unstructured":"Torabi, A., Tandon, N., Sigal, L.: Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124 (2016)"},{"key":"42_CR21","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"42_CR22","unstructured":"Wang, Q., Zhang, Y., Zheng, Y., Pan, P., Hua, X.S.: Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111 (2022)"},{"key":"42_CR23","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"42_CR24","doi-asserted-by":"crossref","unstructured":"Wang, Z., Sung, Y.L., Cheng, F., Bertasius, G., Bansal, M.: Unified coarse-to-fine alignment for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2816\u20132827 (2023)","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"42_CR25","doi-asserted-by":"crossref","unstructured":"Wu, P., He, X., Tang, M., Lv, Y., Liu, J.: Hanet: hierarchical alignment networks for video-text retrieval. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3518\u20133527 (2021)","DOI":"10.1145\/3474085.3475515"},{"key":"42_CR26","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"42_CR27","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"42_CR28","doi-asserted-by":"crossref","unstructured":"Yu, Y., Kim, J., Kim, G.: A joint sequence fusion model for video question answering and retrieval. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 471\u2013487 (2018)","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"42_CR29","doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: Merlot reserve: neural script knowledge through vision and language and sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16375\u201316387 (2022)","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"42_CR30","doi-asserted-by":"crossref","unstructured":"Zhang, D., Wang, Z., Hu, Z., Wu, X.J.: HTVR: hierarchical text-to-video retrieval based on relative similarity. Patt. Recogn., 112145 (2025)","DOI":"10.1016\/j.patcog.2025.112145"},{"key":"42_CR31","doi-asserted-by":"crossref","unstructured":"Zhao, S., Zhu, L., Wang, X., Yang, Y.: Centerclip: token clustering for efficient text-video retrieval. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 970\u2013981 (2022)","DOI":"10.1145\/3477495.3531950"},{"key":"42_CR32","unstructured":"Zou, X., Wu, C., Cheng, L., Wang, Z.: Tokenflow: rethinking fine-grained cross-modal alignment in vision-language retrieval. arXiv preprint arXiv:2209.13822 (2022)"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-6957-1_42","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T10:45:56Z","timestamp":1770633956000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-6957-1_42"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819569564","9789819569571"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-6957-1_42","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"10 February 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Prague","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 January 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"32","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2026.cz\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}