{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:16:22Z","timestamp":1755839782784,"version":"3.40.3"},"publisher-location":"Cham","reference-count":94,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729669"},{"type":"electronic","value":"9783031729676"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72967-6_21","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:03:43Z","timestamp":1730574223000},"page":"375-396","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Learning Video Context as\u00a0Interleaved Multimodal Sequences"],"prefix":"10.1007","author":[{"given":"Kevin Qinghong","family":"Lin","sequence":"first","affiliation":[]},{"given":"Pengchuan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Difei","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Xide","family":"Xia","sequence":"additional","affiliation":[]},{"given":"Joya","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Ziteng","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Jinheng","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Xuhong","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Ahmadyan, A., Zhang, L., Ablavatski, A., Wei, J., Grundmann, M.: Objectron: a large scale dataset of object-centric videos in the wild with pose annotations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00773"},{"key":"21_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Argaw, D.M., Lee, J.Y., Woodson, M., Kweon, I.S., Heilbron, F.C.: Long-range multimodal pretraining for movie understanding. 
In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13392\u201313403 (2023)","DOI":"10.1109\/ICCV51070.2023.01232"},{"key":"21_CR5","unstructured":"Awadalla, A., et\u00a0al.: Openflamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"21_CR6","doi-asserted-by":"crossref","unstructured":"Bain, M., Huh, J., Han, T., Zisserman, A.: Whisperx: time-accurate speech transcription of long-form audio (2023)","DOI":"10.21437\/Interspeech.2023-78"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Brown, A., Zisserman, A.: Condensed movies: story based retrieval with contextual embeddings. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69541-5_28"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"21_CR9","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: A clip-hitchhiker\u2019s guide to long video retrieval. arXiv preprint arXiv:2205.08508 (2022)"},{"key":"21_CR10","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"21_CR11","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol.\u00a02, p.\u00a04 (2021)"},{"key":"21_CR12","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS, pp. 1877\u20131901 (2020)"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: Videollm-online: online video large language model for streaming video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18407\u201318418 (2024)","DOI":"10.1109\/CVPR52733.2024.01742"},{"key":"21_CR15","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Croitoru, I., et al.: Teachtext: crossmodal generalized distillation for text-video retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11583\u201311593 (2021)","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"21_CR17","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"21_CR19","unstructured":"Gao, D., et al.: Assistgpt: a general multi-modal assistant that can plan, execute, inspect, and learn. arXiv:2306.08640 (2023)"},{"key":"21_CR20","doi-asserted-by":"crossref","unstructured":"Gao, D., Zhou, L., Ji, L., Zhu, L., Yang, Y., Shou, M.Z.: Mist: multi-modal iterative spatial-temporal transformer for long-form video question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14773\u201314783 (2023)","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Goyal, R., et\u00a0al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"21_CR22","unstructured":"Guu, K., Lee, K., Tung, Z., Pasupat, P., Chang, M.: Retrieval augmented language model pre-training. In: International Conference on Machine Learning, pp. 3929\u20133938. PMLR (2020)"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Han, T., Bain, M., Nagrani, A., Varol, G., Xie, W., Zisserman, A.: Autoad ii: the sequel-who, when, and what in movie audio description. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13645\u201313655 (2023)","DOI":"10.1109\/ICCV51070.2023.01255"},{"key":"21_CR24","doi-asserted-by":"crossref","unstructured":"Han, T., Bain, M., Nagrani, A., Varol, G., Xie, W., Zisserman, A.: AutoAD: movie description in context. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01815"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Temporal alignment networks for long-term video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2906\u20132916 (2022)","DOI":"10.1109\/CVPR52688.2022.00292"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"He, B., et al.: Ma-lmm: memory-augmented large multimodal model for long-term video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13504\u201313514 (2024)","DOI":"10.1109\/CVPR52733.2024.01282"},{"key":"21_CR27","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"21_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1007\/978-3-030-58548-8_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Q Huang","year":"2020","unstructured":"Huang, Q., Xiong, Yu., Rao, A., Wang, J., Lin, D.: MovieNet: a holistic dataset for movie understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 709\u2013727. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_41"},{"key":"21_CR29","doi-asserted-by":"publisher","unstructured":"Islam, M.M., Bertasius, G.: Long movie clip classification with state-space video models. In: European Conference on Computer Vision, pp. 87\u2013104. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_6","DOI":"10.1007\/978-3-031-19833-5_6"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Islam, M.M., Hasan, M., Athrey, K.S., Braskich, T., Bertasius, G.: Efficient movie scene detection using state-space transformers. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18749\u201318758 (2023)","DOI":"10.1109\/CVPR52729.2023.01798"},{"key":"21_CR31","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"21_CR32","doi-asserted-by":"crossref","unstructured":"Kim, J., Ma, M., Kim, K., Kim, S., Yoo, C.D.: Progressive attention memory network for movie story question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8337\u20138346 (2019)","DOI":"10.1109\/CVPR.2019.00853"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: Hmdb: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Lei, J., Wang, L., Shen, Y., Yu, D., Berg, T.L., Bansal, M.: Mart: memory-augmented recurrent transformer for coherent video paragraph captioning. arXiv preprint arXiv:2005.05402 (2020)","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"21_CR35","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Bansal, M., Berg, T.L.: Tvqa: localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018)","DOI":"10.18653\/v1\/D18-1167"},{"key":"21_CR36","first-page":"9459","volume":"33","author":"P Lewis","year":"2020","unstructured":"Lewis, P., et al.: Retrieval-augmented generation for knowledge-intensive nlp tasks. Adv. Neural. Inf. Process. Syst. 33, 9459\u20139474 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR37","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"21_CR38","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"21_CR39","unstructured":"Li, K., et al.: Videochat: chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"21_CR40","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"21_CR41","unstructured":"Li, L., et\u00a0al.: Value: a multi-task benchmark for video-and-language understanding evaluation. arXiv preprint arXiv:2106.04632 (2021)"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., Jia, J.: Llama-vid: an image is worth 2 tokens in large language models. arXiv preprint arXiv:2311.17043 (2023)","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"21_CR43","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"21_CR44","unstructured":"Lin, K., et\u00a0al.: Mm-vid: advancing video understanding with gpt-4v (ision). arXiv preprint arXiv:2310.19773 (2023)"},{"key":"21_CR45","doi-asserted-by":"crossref","unstructured":"Lin, K., et al.: Swinbert: end-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
17949\u201317958 (2022)","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"21_CR46","first-page":"7575","volume":"35","author":"KQ Lin","year":"2022","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. Adv. Neural. Inf. Process. Syst. 35, 7575\u20137586 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR47","doi-asserted-by":"crossref","unstructured":"Lin, K.Q., et al.: Univtg: towards unified video-language temporal grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2794\u20132804 (2023)","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"21_CR48","doi-asserted-by":"publisher","unstructured":"Lin, Y.B., Lei, J., Bansal, M., Bertasius, G.: Eclipse: efficient long-range video retrieval using sight and sound. In: European Conference on Computer Vision, pp. 413\u2013430. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19830-4_24","DOI":"10.1007\/978-3-031-19830-4_24"},{"key":"21_CR49","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"21_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C.W., Shan, Y., Qie, X.: Umt: unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3042\u20133051 (2022)","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"21_CR51","unstructured":"Lu, Y., Li, C., Liu, H., Yang, J., Gao, J., Shen, Y.: An empirical study of scaling instruct-tuned large multimodal models. arXiv:2309.09958 (2023)"},{"key":"21_CR52","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-chatgpt: towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"21_CR53","unstructured":"Mangalam, K., Akshulakov, R., Malik, J.: Egoschema: a diagnostic benchmark for very long-form video language understanding. arXiv preprint arXiv:2308.09126 (2023)"},{"key":"21_CR54","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"21_CR55","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)"},{"key":"21_CR56","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: ClipCap: CLIP prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)"},{"key":"21_CR57","doi-asserted-by":"crossref","unstructured":"Nukrai, D., Mokady, R., Globerson, A.: Text-only training for image captioning using noise-injected CLIP. arXiv preprint arXiv:2211.00575 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"21_CR58","unstructured":"OpenAI: Gpt-4 technical report (2023)"},{"key":"21_CR59","unstructured":"OpenAI: Introducing chatgpt (2023). https:\/\/openai.com\/blog\/chatgpt\/"},{"key":"21_CR60","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. 
In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"21_CR61","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"21_CR62","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) ICML, pp. 8748\u20138763 (2021)"},{"key":"21_CR63","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"21_CR64","unstructured":"Radford, A., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI Blog p.\u00a09 (2019)"},{"key":"21_CR65","doi-asserted-by":"crossref","unstructured":"Real, E., Shlens, J., Mazzocchi, S., Pan, X., Vanhoucke, V.: Youtube-boundingboxes: a large high-precision human-annotated data set for object detection in video. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5296\u20135305 (2017)","DOI":"10.1109\/CVPR.2017.789"},{"key":"21_CR66","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"21_CR67","doi-asserted-by":"crossref","unstructured":"Soldan, M., et al.: Mad: a scalable dataset for language grounding in videos from movie audio descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5026\u20135035 (2022)","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"21_CR68","doi-asserted-by":"crossref","unstructured":"Song, E., et\u00a0al.: Moviechat: from dense token to sparse memory for long video understanding. arXiv preprint arXiv:2307.16449 (2023)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"21_CR69","doi-asserted-by":"crossref","unstructured":"Srivastava, D., Singh, A.K., Tapaswi, M.: How you feelin\u2019? learning emotions and mental states in movie scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2517\u20132528 (2023)","DOI":"10.1109\/CVPR52729.2023.00248"},{"key":"21_CR70","first-page":"38032","volume":"35","author":"Y Sun","year":"2022","unstructured":"Sun, Y., et al.: Long-form video-language pre-training with multimodal temporal contrastive learning. Adv. Neural. Inf. Process. Syst. 35, 38032\u201338045 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR71","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Menon, S., Vondrick, C.: Vipergpt: visual inference via python execution for reasoning. arXiv:2303.08128 (2023)","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"21_CR72","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: Movieqa: understanding stories in movies through question-answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
4631\u20134640 (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"21_CR73","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural. Inf. Process. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR74","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"21_CR75","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv:2307.09288 (2023)"},{"key":"21_CR76","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"21_CR77","doi-asserted-by":"crossref","unstructured":"Vicol, P., Tapaswi, M., Castrejon, L., Fidler, S.: Moviegraphs: towards understanding human-centric situations from videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8581\u20138590 (2018)","DOI":"10.1109\/CVPR.2018.00895"},{"key":"21_CR78","unstructured":"Wang, A.J., et al.: Cosmo: contrastive streamlined multimodal model with interleaved pre-training. arXiv preprint arXiv:2401.00849 (2024)"},{"key":"21_CR79","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1109\/TIP.2019.2931534","volume":"29","author":"A Wang","year":"2019","unstructured":"Wang, A., et al.: Holistic multi-modal memory network for movie question answering. IEEE Trans. Image Process. 29, 489\u2013499 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"21_CR80","doi-asserted-by":"crossref","unstructured":"Wang, J., et\u00a0al.: All in one: exploring unified video-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6598\u20136608 (2023)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"21_CR81","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Selective structured state-spaces for long-form video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6387\u20136397 (2023)","DOI":"10.1109\/CVPR52729.2023.00618"},{"key":"21_CR82","unstructured":"Wang, Y., et\u00a0al.: Internvideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"21_CR83","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., et al.: Long-term feature banks for detailed video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 284\u2013293 (2019)","DOI":"10.1109\/CVPR.2019.00037"},{"key":"21_CR84","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., Krahenbuhl, P.: Towards long-form video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1884\u20131894 (2021)","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"21_CR85","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., et al.: Memvit: memory-augmented multiscale vision transformer for efficient long-term video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
13587\u201313597 (2022)","DOI":"10.1109\/CVPR52688.2022.01322"},{"key":"21_CR86","unstructured":"Xie, J., et\u00a0al.: Learning long-form video prior via generative pre-training. arXiv preprint arXiv:2404.15909 (2024)"},{"key":"21_CR87","first-page":"124","volume":"35","author":"A Yang","year":"2022","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. Adv. Neural. Inf. Process. Syst. 35, 124\u2013141 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"21_CR88","unstructured":"Yang, A., Nagrani, A., Laptev, I., Sivic, J., Schmid, C.: Vidchapters-7m: video chapters at scale. arXiv preprint arXiv:2309.13952 (2023)"},{"key":"21_CR89","unstructured":"Yang, Z., et al.: Mm-react: prompting chatgpt for multimodal reasoning and action. arXiv:2303.11381 (2023)"},{"key":"21_CR90","doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: Merlot reserve: neural script knowledge through vision and language and sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16375\u201316387 (2022)","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"21_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Mm-narrator: narrating long-form videos with multimodal in-context learning. arXiv preprint arXiv:2311.17435 (2023)","DOI":"10.1109\/CVPR52733.2024.01295"},{"key":"21_CR92","unstructured":"Zhang, S., et al.: Gpt4roi: instruction tuning large language model on region-of-interest. arXiv:2307.03601 (2023)"},{"key":"21_CR93","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)"},{"key":"21_CR94","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. 
    "container-title": ["Lecture Notes in Computer Science", "Computer Vision \u2013 ECCV 2024"],
    "original-title": [],
    "language": "en",
    "link": [
      {
        "URL": "https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72967-6_21",
        "content-type": "unspecified",
        "content-version": "vor",
        "intended-application": "similarity-checking"
      }
    ],
    "deposited": {"date-parts": [[2024, 11, 2]], "date-time": "2024-11-02T19:16:14Z", "timestamp": 1730574974000},
    "score": 1,
    "resource": {"primary": {"URL": "https:\/\/link.springer.com\/10.1007\/978-3-031-72967-6_21"}},
    "subtitle": [],
    "short-title": [],
    "issued": {"date-parts": [[2024, 11, 3]]},
    "ISBN": ["9783031729669", "9783031729676"],
    "references-count": 94,
    "URL": "https:\/\/doi.org\/10.1007\/978-3-031-72967-6_21",
    "relation": {},
    "ISSN": ["0302-9743", "1611-3349"],
    "issn-type": [
      {"type": "print", "value": "0302-9743"},
      {"type": "electronic", "value": "1611-3349"}
    ],
    "subject": [],
    "published": {"date-parts": [[2024, 11, 3]]},
    "assertion": [
      {"value": "3 November 2024", "order": 1, "name": "first_online", "label": "First Online", "group": {"name": "ChapterHistory", "label": "Chapter History"}},
      {"value": "ECCV", "order": 1, "name": "conference_acronym", "label": "Conference Acronym", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "European Conference on Computer Vision", "order": 2, "name": "conference_name", "label": "Conference Name", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "Milan", "order": 3, "name": "conference_city", "label": "Conference City", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "Italy", "order": 4, "name": "conference_country", "label": "Conference Country", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "2024", "order": 5, "name": "conference_year", "label": "Conference Year", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "29 September 2024", "order": 7, "name": "conference_start_date", "label": "Conference Start Date", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "4 October 2024", "order": 8, "name": "conference_end_date", "label": "Conference End Date", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "18", "order": 9, "name": "conference_number", "label": "Conference Number", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "eccv2024", "order": 10, "name": "conference_id", "label": "Conference ID", "group": {"name": "ConferenceInfo", "label": "Conference Information"}},
      {"value": "https:\/\/eccv2024.ecva.net\/", "order": 11, "name": "conference_url", "label": "Conference URL", "group": {"name": "ConferenceInfo", "label": "Conference Information"}}
    ]
  }
}