{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,12]],"date-time":"2026-07-12T02:32:05Z","timestamp":1783823525896,"version":"3.55.0"},"publisher-location":"Cham","reference-count":68,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729423","type":"print"},{"value":"9783031729430","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72943-0_5","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:41:04Z","timestamp":1732801264000},"page":"76-94","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["EA-VTR: Event-Aware Video-Text Retrieval"],"prefix":"10.1007","author":[{"given":"Zongyang","family":"Ma","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ziqi","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuxin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhongang","family":"Qi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chunfeng","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bing","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yingmin","family":"Luo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xu","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaojuan","family":"Qi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ying","family":"Shan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weiming","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Bagad, P., Tapaswi, M., Snoek, C.G.: Test of time: instilling video-language models with a sense of time. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2503\u20132516 (2023)","DOI":"10.1109\/CVPR52729.2023.00247"},{"key":"5_CR3","unstructured":"Bai, J., et al.: Lat: latent translation with cycle-consistency for video-text retrieval. arXiv preprint arXiv:2207.04858 (2022)"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"5_CR5","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol.\u00a02, p.\u00a04 (2021)"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"5_CR7","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/978-3-031-19809-0_3","volume-title":"European Conference on Computer Vision","author":"M Cao","year":"2022","unstructured":"Cao, M., Yang, T., Weng, J., Zhang, C., Wang, J., Zou, Y.: LocVTP: video-text pre-training for temporal localization. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, pp. 38\u201356. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19809-0_3"},{"key":"5_CR8","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Vilem: visual-language error modeling for image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11018\u201311027 (2023)","DOI":"10.1109\/CVPR52729.2023.01060"},{"key":"5_CR10","unstructured":"Ego4D Consortium: Egocentric live 4D perception (Ego4D) database: a large-scale first-person video database, supporting research in multi-modal machine perception for daily life activity"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"5_CR12","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"5_CR13","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Freitag, M., Al-Onaizan, Y.: Beam search strategies for neural machine translation. arXiv preprint arXiv:1702.01806 (2017)","DOI":"10.18653\/v1\/W17-3207"},{"key":"5_CR15","unstructured":"Fu, T.J., et al.: Violet: end-to-end video-language transformers with masked visual-token modeling. arXiv preprint arXiv:2111.12681 (2021)"},{"key":"5_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-030-58548-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"V Gabeur","year":"2020","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 214\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_13"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Ge, Y., et al.: Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16167\u201316176 (2022)","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"5_CR18","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"691","DOI":"10.1007\/978-3-031-19833-5_40","volume-title":"European Conference on Computer Vision","author":"Y Ge","year":"2022","unstructured":"Ge, Y., et al.: MILES: visual BERT pre-training with injected language semantics for video-text retrieval. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 691\u2013708. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_40"},{"key":"5_CR19","first-page":"22605","volume":"33","author":"S Ging","year":"2020","unstructured":"Ging, S., Zolfaghari, M., Pirsiavash, H., Brox, T.: Coot: cooperative hierarchical transformer for video-text representation learning. Adv. Neural. Inf. Process. Syst. 33, 22605\u201322618 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR20","unstructured":"Grauman, K., et\u00a0al.: Ego4d: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"5_CR21","unstructured":"Holtzman, A., Buys, J., Du, L., Forbes, M., Choi, Y.: The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751 (2019)"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Huang, J., Li, Y., Feng, J., Wu, X., Sun, X., Ji, R.: Clover: towards a unified video-language alignment and fusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14856\u201314866 (2023)","DOI":"10.1109\/CVPR52729.2023.01427"},{"key":"5_CR23","unstructured":"Jozefowicz, R., Vinyals, O., Schuster, M., Shazeer, N., Wu, Y.: Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410 (2016)"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"5_CR25","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428 (2022)"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Li, D., Li, J., Li, H., Niebles, J.C., Hoi, S.C.: Align and prompt: video-and-language pre-training with entity prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4953\u20134963 (2022)","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"5_CR28","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"5_CR29","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"5_CR31","doi-asserted-by":"crossref","unstructured":"Li, L., et al.: Lavender: unifying video-language understanding as masked language modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23119\u201323129 (2023)","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Li, Y., Min, K., Tripathi, S., Vasconcelos, N.: Svitt: temporal learning of sparse video-text transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18919\u201318929 (2023)","DOI":"10.1109\/CVPR52729.2023.01814"},{"key":"5_CR33","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)"},{"key":"5_CR34","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Video swin transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133211 (2022)","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Lu, H., Fei, N., Huo, Y., Gao, Y., Lu, Z., Wen, J.R.: Cots: collaborative two-stream vision-language pre-training model for cross-modal retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15692\u201315701 (2022)","DOI":"10.1109\/CVPR52688.2022.01524"},{"key":"5_CR36","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"5_CR37","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"5_CR38","unstructured":"Oord, A.V.D., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"5_CR39","unstructured":"Patrick, M., et al.: Support-set bottlenecks for video-text representation learning. arXiv preprint arXiv:2010.02824 (2020)"},{"key":"5_CR40","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"8","key":"5_CR41","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"5_CR42","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"5_CR43","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"5_CR44","doi-asserted-by":"crossref","unstructured":"Rouditchenko, A., et\u00a0al.: Avlnet: learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199 (2020)","DOI":"10.21437\/Interspeech.2021-1312"},{"key":"5_CR45","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"5_CR46","doi-asserted-by":"crossref","unstructured":"Shao, D., Xiong, Y., Zhao, Y., Huang, Q., Qiao, Y., Lin, D.: Find and focus: retrieve and localize video events with natural language queries. In: Proceedings of the European Conference on Computer Vision (ECCV) (2018)","DOI":"10.1007\/978-3-030-01240-3_13"},{"key":"5_CR47","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"5_CR48","doi-asserted-by":"crossref","unstructured":"Shi, Y., et\u00a0al.: Learning semantics-grounded vocabulary representation for video-text retrieval. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 4460\u20134470 (2023)","DOI":"10.1145\/3581783.3612537"},{"key":"5_CR49","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"5_CR50","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Shalev, Y., Schwartz, I., Wolf, L.: Zerocap: zero-shot image-to-text generation for visual-semantic arithmetic. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17918\u201317928 (2022)","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"5_CR51","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Object-aware video-language pre-training for retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3313\u20133322 (2022)","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"5_CR52","doi-asserted-by":"crossref","unstructured":"Wang, J., et\u00a0al.: All in one: exploring unified video-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6598\u20136608 (2023)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"5_CR53","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"5_CR54","doi-asserted-by":"crossref","unstructured":"Wu, W., Luo, H., Fang, B., Wang, J., Ouyang, W.: Cap4video: what can auxiliary captions do for text-video retrieval? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10704\u201310713 (2023)","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"5_CR55","doi-asserted-by":"crossref","unstructured":"Wu, X., Gao, C., Lin, Z., Wang, Z., Han, J., Hu, S.: Rap: redundancy-aware video-language pre-training for text-video retrieval. arXiv preprint arXiv:2210.06881 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.221"},{"key":"5_CR56","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"5_CR57","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"5_CR58","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: Boundary-sensitive pre-training for temporal localization in videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7220\u20137230 (2021)","DOI":"10.1109\/ICCV48922.2021.00713"},{"key":"5_CR59","unstructured":"Xue, H., et al.: CLIP-ViP: adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430 (2022)"},{"key":"5_CR60","doi-asserted-by":"crossref","unstructured":"Yan, R., et al.: Video-text pre-training with learned regions for retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 3100\u20133108 (2023)","DOI":"10.1609\/aaai.v37i3.25414"},{"key":"5_CR61","doi-asserted-by":"crossref","unstructured":"Yang, J., Bisk, Y., Gao, J.: Taco: token-aware cascade contrastive learning for video-text alignment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11562\u201311572 (2021)","DOI":"10.1109\/ICCV48922.2021.01136"},{"key":"5_CR62","doi-asserted-by":"crossref","unstructured":"Yang, X., et al.: Learning trajectory-word alignments for video-language tasks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2504\u20132514 (2023)","DOI":"10.1109\/ICCV51070.2023.00237"},{"key":"5_CR63","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: Hitea: hierarchical temporal-aware video-language pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15405\u201315416 (2023)","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"5_CR64","doi-asserted-by":"crossref","unstructured":"Zhang, G., Ren, J., Gu, J., Tresp, V.: Multi-event video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22113\u201322123 (2023)","DOI":"10.1109\/ICCV51070.2023.02021"},{"key":"5_CR65","doi-asserted-by":"crossref","unstructured":"Zhang, H., Liu, D., Lv, Z., Su, B., Tao, D.: Exploring temporal concurrency for video-language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 15568\u201315578 (2023)","DOI":"10.1109\/ICCV51070.2023.01427"},{"key":"5_CR66","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., Girdhar, R.: Learning video representations from large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6586\u20136597 (2023)","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"5_CR67","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"5_CR68","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y.: Actbert: learning global-local video-text representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8746\u20138755 (2020)","DOI":"10.1109\/CVPR42600.2020.00877"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72943-0_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:16:10Z","timestamp":1732803370000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72943-0_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031729423","9783031729430"],"references-count":68,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72943-0_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}