{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:45:07Z","timestamp":1777657507660,"version":"3.51.4"},"publisher-location":"Cham","reference-count":114,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730122","type":"print"},{"value":"9783031730139","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73013-9_23","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T07:54:44Z","timestamp":1732607684000},"page":"396-416","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":151,"title":["InternVideo2: Scaling Foundation Models for\u00a0Multimodal Video Understanding"],"prefix":"10.1007","author":[{"given":"Yi","family":"Wang","sequence":"first","affiliation":[]},{"given":"Kunchang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xinhao","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jiashuo","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Yinan","family":"He","sequence":"additional","affiliation":[]},{"given":"Guo","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Baoqi","family":"Pei","sequence":"additional","affiliation":[]},{"given":"Rongkun","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Zun","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yansong","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Tianxiang","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Songze","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jilan","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Hongjie","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yifei","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Yali","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Limin","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"key":"23_CR1","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. ArXiv abs\/2204.14198 (2022)"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: ICCV, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. 
In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"23_CR4","unstructured":"Bardes, A., et al.: V-JEPA: latent video prediction for visual representation learning (2024). https:\/\/openreview.net\/forum?id=WFYbBOEOtv"},{"key":"23_CR5","unstructured":"Behera, S.R., Injeti, K.M., Patibandla, J.S.K., Pokala, P.K., Pailla, B.R.: Aquallm: audio question answering data generation using large language models. arXiv preprint arXiv:2312.17343 (2023)"},{"key":"23_CR6","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"23_CR7","unstructured":"Bruce, J., et\u00a0al.: Genie: generative interactive environments. arXiv preprint arXiv:2402.15391 (2024)"},{"key":"23_CR8","unstructured":"Carreira, J., Noland, E., Banki-Horvath, A., Hillier, C., Zisserman, A.: A short note about kinetics-600. ArXiv abs\/1808.01340 (2018)"},{"key":"23_CR9","unstructured":"Carreira, J., Noland, E., Hillier, C., Zisserman, A.: A short note on the kinetics-700 human action dataset. ArXiv abs\/1907.06987 (2019)"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"23_CR11","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, vol. 1, pp. 190\u2013200. Association for Computational Linguistics (2011)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Chen, G., Zheng, Y.D., Wang, L., Lu, T.: DCAN: improving temporal action detection via dual context aggregation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 248\u2013257 (2022)","DOI":"10.1609\/aaai.v36i1.19900"},{"key":"23_CR13","unstructured":"Chen, S., et al.: BEATs: audio pre-training with acoustic tokenizers. In: Proceedings of the 40th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 5178\u20135193. PMLR (2023)"},{"key":"23_CR14","unstructured":"Chen, S., et al.: Valor: Vision-audio-language omni-perception pretraining model and dataset. arXiv preprint arXiv:2304.08345 (2023)"},{"key":"23_CR15","unstructured":"Chen, S., et al.: VAST: a vision-audio-subtitle-text omni-modality foundation model and dataset. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR16","unstructured":"Chen, Z., et al.: Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534 (2022)"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Chen, Z., et al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. arXiv preprint arXiv:2312.14238 (2023)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"23_CR18","unstructured":"Cheng, B., Schwing, A.G., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. In: NeurIPS (2021)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., Crandall, D.J., Bansal, M., Bertasius, G.: Vindlu: a recipe for effective video-and-language pretraining. ArXiv abs\/2212.05051 (2022)","DOI":"10.1109\/CVPR52729.2023.01034"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: CVPR, pp. 
2818\u20132829 (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"23_CR21","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning. In: NeurIPS (2023)"},{"key":"23_CR22","first-page":"16344","volume":"35","author":"T Dao","year":"2022","unstructured":"Dao, T., Fu, D., Ermon, S., Rudra, A., R\u00e9, C.: FlashAttention: fast and memory-efficient exact attention with IO-awareness. NeurIPS 35, 16344\u201316359 (2022)","journal-title":"NeurIPS"},{"key":"23_CR23","unstructured":"Dehghani, M., et al.: Scaling vision transformers to 22 billion parameters. In: ICML (2023)"},{"key":"23_CR24","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. ArXiv abs\/1810.04805 (2018)"},{"key":"23_CR25","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. ArXiv abs\/2010.11929 (2020)"},{"key":"23_CR26","unstructured":"Driess, D., et al.: PaLM-e: an embodied multimodal language model. In: ICML (2023)"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: An audio captioning dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 736\u2013740. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"23_CR28","unstructured":"Feichtenhofer, C., Fan, H., Li, Y., He, K.: Masked autoencoders as spatiotemporal learners. In: NeurIPS (2022)"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"23_CR30","unstructured":"Fu, T.J., et al.: Violet: end-to-end video-language transformers with masked visual-token modeling. arXiv preprint arXiv:2111.12681 (2021)"},{"key":"23_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-030-58548-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"V Gabeur","year":"2020","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020 Part IV. LNCS, vol. 12349, pp. 214\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_13"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: temporal activity localization via language query. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"23_CR33","unstructured":"Gong, T., et al.: Multimodal-GPT: a vision and language model for dialogue with humans. ArXiv abs\/2305.04790 (2023)"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.A., Glass, J.: AST: audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"23_CR36","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in VQA matter: elevating the role of image understanding in visual question answering. 
In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"23_CR38","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Idrees, H., et al.: The THUMOS challenge on action recognition for videos \u201cin the wild\u201d. Comput. Vis. Image Underst. (2017)","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"23_CR40","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"23_CR41","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: AudioCaps: generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 119\u2013132 (2019)"},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"23_CR44","unstructured":"Lei, J., Berg, T.L., Bansal, M.: QVHighlights: detecting moments and highlights in videos via natural language queries (2021)"},{"key":"23_CR45","doi-asserted-by":"crossref","unstructured":"Li, G., Xu, Y., Hu, D.: Multi-scale attention for audio question answering. arXiv preprint arXiv:2305.17993 (2023)","DOI":"10.21437\/Interspeech.2023-1606"},{"key":"23_CR46","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2022)"},{"key":"23_CR47","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"23_CR48","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Li, K., et\u00a0al.: MVBench: a comprehensive multi-modal video understanding benchmark. arXiv preprint arXiv:2311.17005 (2023)","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"23_CR50","unstructured":"Li, K., et al.: UniFormerv2: spatiotemporal learning by arming image ViTS with video uniformer. arXiv preprint arXiv:2211.09552 (2022)"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Unmasked teacher: towards training-efficient video foundation models. arXiv preprint arXiv:2303.16058 (2023)","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"23_CR52","unstructured":"Li, T., Wang, L.: Learning spatiotemporal features via video and text pair discrimination. 
CoRR abs\/2001.05691 (2020), https:\/\/arxiv.org\/abs\/2001.05691"},{"key":"23_CR53","doi-asserted-by":"crossref","unstructured":"Lin, K.Q., et al.: UniVTG: towards unified video-language temporal grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2794\u20132804 (2023)","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"23_CR54","doi-asserted-by":"crossref","unstructured":"Lin, T., Liu, X., Li, X., Ding, E., Wen, S.: BMN: boundary-matching network for temporal action proposal generation (2019)","DOI":"10.1109\/ICCV.2019.00399"},{"key":"23_CR55","doi-asserted-by":"crossref","unstructured":"Lipping, S., Sudarsanam, P., Drossos, K., Virtanen, T.: Clotho-AQA: a crowdsourced dataset for audio question answering. In: 2022 30th European Signal Processing Conference (EUSIPCO), pp. 1140\u20131144. IEEE (2022)","DOI":"10.23919\/EUSIPCO55093.2022.9909680"},{"key":"23_CR56","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023)"},{"key":"23_CR57","doi-asserted-by":"publisher","first-page":"6937","DOI":"10.1109\/TIP.2022.3217368","volume":"31","author":"Y Liu","year":"2022","unstructured":"Liu, Y., Wang, L., Wang, Y., Ma, X., Qiao, Y.: FineAction: a fine-grained video dataset for temporal action localization. IEEE Trans. Image Process. 31, 6937\u20136950 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"23_CR58","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"23_CR59","doi-asserted-by":"crossref","unstructured":"Luo, H., et al.: CLIP4Clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing (2022)","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"23_CR60","unstructured":"Luo, R., et al.: Valley: video assistant with large language model enhanced ability. ArXiv abs\/2306.07207 (2023)"},{"key":"23_CR61","unstructured":"Maaz, M., Rasheed, H.A., Khan, S., Khan, F.S.: Video-chatGPT: towards detailed video understanding via large vision and language models. ArXiv abs\/2306.05424 (2023)"},{"key":"23_CR62","unstructured":"Mangalam, K., Akshulakov, R., Malik, J.: EgoSchema: a diagnostic benchmark for very long-form video language understanding. In: Oh, A., Neumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) NeurIPS, pp. 46212\u201346244 (2023)"},{"key":"23_CR63","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: a visual question answering benchmark requiring external knowledge. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"23_CR64","doi-asserted-by":"crossref","unstructured":"Mei, X., et al.: WavCaps: a ChatGPT-assisted weakly-labelled audio captioning dataset for audio-language multimodal research (2023)","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"23_CR65","doi-asserted-by":"crossref","unstructured":"Monfort, M., et al.: Moments in time dataset: One million videos for event understanding. TPAMI (2020)","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"23_CR66","unstructured":"Moon, W., Hyun, S., Lee, S., Heo, J.P.: Correlation-guided query-dependency calibration in video representation learning for temporal grounding. arXiv preprint arXiv:2311.08835 (2023)"},{"key":"23_CR67","unstructured":"OpenAI: GPT-4 technical report. ArXiv abs\/2303.08774 (2023)"},{"key":"23_CR68","unstructured":"OpenAI: GPT-4V(ision) system card (2023). 
https:\/\/api.semanticscholar.org\/CorpusID:263218031"},{"key":"23_CR69","unstructured":"Oquab, M., et\u00a0al.: Dinov2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"23_CR70","unstructured":"Patraucean, V., et al.: Perception test : a diagnostic benchmark for multimodal models. In: NeurIPS (2023)"},{"key":"23_CR71","doi-asserted-by":"crossref","unstructured":"Piczak, K.J.: ESC: dataset for environmental sound classification. In: Proceedings of the 23rd ACM International Conference on Multimedia, pp. 1015\u20131018 (2015)","DOI":"10.1145\/2733373.2806390"},{"key":"23_CR72","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"23_CR73","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"23_CR74","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763. PMLR (2021)"},{"key":"23_CR75","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"23_CR76","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR (2020)"},{"key":"23_CR77","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: CVPR, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"23_CR78","unstructured":"Ryali, C., et al.: Hiera: a hierarchical vision transformer without the bells-and-whistles (2023)"},{"key":"23_CR79","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"23_CR80","unstructured":"Sun, Q., et\u00a0al.: Generative multimodal models are in-context learners. arXiv preprint arXiv:2312.13286 (2023)"},{"key":"23_CR81","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., Cao, Y.: EVA-CLIP: improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)"},{"key":"23_CR82","unstructured":"Sun, Q., et al.: EVA-CLIP-18B: scaling clip to 18 billion parameters. arXiv preprint arXiv:2402.04252 (2024)"},{"key":"23_CR83","unstructured":"Sun, Q., et al.: Generative pretraining in multimodality. arXiv preprint arXiv:2307.05222 (2023)"},{"key":"23_CR84","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"23_CR85","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. NeurIPS 35, 10078\u201310093 (2022)","journal-title":"NeurIPS"},{"key":"23_CR86","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. ArXiv abs\/2302.13971 (2023)"},{"key":"23_CR87","unstructured":"Touvron, H., et al.: Llama 2: Open foundation and fine-tuned chat models. 
ArXiv abs\/2307.09288 (2023)"},{"key":"23_CR88","doi-asserted-by":"crossref","unstructured":"Tran, D., Xiu Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"23_CR89","doi-asserted-by":"crossref","unstructured":"Wang, J., Y., et\u00a0al.: All in one: exploring unified video-language pre-training. In: CVPR, pp. 6598\u20136608 (2023)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"23_CR90","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: VideoMAE V2: scaling video masked autoencoders with dual masking. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"23_CR91","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"23_CR92","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: Masked video distillation: rethinking masked feature modeling for self-supervised video representation learning. In: CVPR, pp. 6312\u20136322 (2023)","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"23_CR93","unstructured":"Wang, Y., et\u00a0al.: InternVid: a large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942 (2023)"},{"key":"23_CR94","unstructured":"Wang, Y., et al.: InternVideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"23_CR95","unstructured":"Xu, H., et\u00a0al.: mPLUG-2: a modularized multi-modal foundation model across text, image and video. arXiv preprint arXiv:2302.00402 (2023)"},{"key":"23_CR96","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"23_CR97","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"23_CR98","unstructured":"Yan, S., et al.: Video-text modeling with zero-shot transfer from contrastive captioners. ArXiv abs\/2212.04979 (2022)"},{"key":"23_CR99","doi-asserted-by":"crossref","unstructured":"Yang, L., Fan, Y., Xu, N.: Video instance segmentation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00529"},{"key":"23_CR100","doi-asserted-by":"crossref","unstructured":"Yang, M., Chen, G., Zheng, Y.D., Lu, T., Wang, L.: BasicTAD: an astounding RGB-only baseline for temporal action detection. Comput. Vis. Image Understanding, 103692 (2023)","DOI":"10.1016\/j.cviu.2023.103692"},{"key":"23_CR101","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality (2023)"},{"key":"23_CR102","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: COCA: contrastive captioners are image-text foundation models. 
arXiv preprint arXiv:2205.01917 (2022)"},{"issue":"10","key":"23_CR103","doi-asserted-by":"publisher","first-page":"2425","DOI":"10.1007\/s11263-022-01657-x","volume":"130","author":"\u00c9 Zablocki","year":"2022","unstructured":"Zablocki, \u00c9., Ben-Younes, H., P\u00e9rez, P., Cord, M.: Explainability of deep vision-based autonomous driving systems: review and challenges. Int. J. Comput. Vis. 130(10), 2425\u20132452 (2022)","journal-title":"Int. J. Comput. Vis."},{"key":"23_CR104","doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: Merlot reserve: neural script knowledge through vision and language and sound. In: CVPR, pp. 16375\u201316387 (2022)","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"23_CR105","unstructured":"Zeng, Z., Ge, Y., Tong, Z., Liu, X., Xia, S.T., Shan, Y.: TVTSv2: learning out-of-the-box spatiotemporal visual representations at scale. arXiv preprint arXiv:2305.14173 (2023)"},{"key":"23_CR106","unstructured":"Zhang, B., et al.: Co-training transformer with videos and images improves action recognition. ArXiv abs\/2112.07175 (2021)"},{"key":"23_CR107","doi-asserted-by":"crossref","unstructured":"Zhang, C.L., Wu, J.Z., Li, Y.: ActionFormer: localizing moments of actions with transformers. ArXiv abs\/2202.07925 (2022)","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"23_CR108","doi-asserted-by":"crossref","unstructured":"Zhao, H., Torralba, A., Torresani, L., Yan, Z.: HACS: human action clips and segments dataset for recognition and temporal localization. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00876"},{"key":"23_CR109","unstructured":"Zhao, L., et al.: VideoPrism: a foundational visual encoder for video understanding (2024)"},{"key":"23_CR110","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Hessel, J., Yu, Y., Lu, X., Zellers, R., Choi, Y.: Connecting the dots between audio and text without parallel data through visual knowledge transfer. arXiv preprint arXiv:2112.08995 (2021)","DOI":"10.18653\/v1\/2022.naacl-main.333"},{"key":"23_CR111","unstructured":"Zheng, L., et\u00a0al.: Judging LLM-as-a-judge with MT-bench and chatbot arena. arXiv preprint arXiv:2306.05685 (2023)"},{"key":"23_CR112","unstructured":"Zhu, B., et\u00a0al.: LanguageBind: extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)"},{"key":"23_CR113","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. ArXiv abs\/2304.10592 (2023)"},{"key":"23_CR114","doi-asserted-by":"crossref","unstructured":"Zhu, W., et al.: AutoShot: a short video dataset and state-of-the-art shot boundary detection. In: CVPR, pp. 
2237\u20132246 (2023)","DOI":"10.1109\/CVPRW59228.2023.00218"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73013-9_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T08:32:53Z","timestamp":1732609973000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73013-9_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031730122","9783031730139"],"references-count":114,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73013-9_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
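
A minimal sketch of how a record like the one above can be retrieved and inspected, assuming network access and the public Crossref REST API endpoint `https://api.crossref.org/works/{DOI}` (which returns the same `{"status": "ok", "message-type": "work", "message": {...}}` envelope shown here). Field names are taken directly from the record above; nothing else about the chapter is assumed.

```python
# Sketch: fetch the Crossref work record for this chapter and pull out a few
# of the fields present in the JSON above (title, authors, venue, reference count).
# Assumption: the public Crossref REST API is reachable from this machine.
import json
import urllib.request

DOI = "10.1007/978-3-031-73013-9_23"
URL = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(URL) as resp:
    record = json.load(resp)

# The payload is an envelope; the actual work metadata sits under "message".
work = record["message"]

title = work["title"][0]                     # "InternVideo2: Scaling Foundation Models ..."
authors = [f'{a["given"]} {a["family"]}' for a in work.get("author", [])]
venue = work.get("container-title", [])      # e.g. ["Lecture Notes in Computer Science", "Computer Vision – ECCV 2024"]
n_refs = work.get("reference-count")         # 114 in this record

print(title)
print(f"{len(authors)} authors, first: {authors[0]}")
print("Published in:", " / ".join(venue))
print("References:", n_refs)
```

The same `message` object also carries the `reference` array (keys `23_CR1` through `23_CR114`), the license and Crossmark entries, and the conference assertions seen at the end of the record, so the snippet can be extended to walk those fields in the same way.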