{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:33:50Z","timestamp":1778258030130,"version":"3.51.4"},"reference-count":95,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T00:00:00Z","timestamp":1770595200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T00:00:00Z","timestamp":1770595200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02620-2","type":"journal-article","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T12:36:27Z","timestamp":1770640587000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Kangaroo: A Powerful Video-Language Model Supporting Long-context Video Input"],"prefix":"10.1007","volume":"134","author":[{"given":"Jiajun","family":"Liu","sequence":"first","affiliation":[]},{"given":"Yibing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Hanghang","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Xiaoping","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Xiaoqi","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Xiaoming","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Jianbin","family":"Jiao","sequence":"additional","affiliation":[]},{"given":"Enhua","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5150-1003","authenticated-orcid":false,"given":"Jie","family":"Hu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,9]]},"reference":[{"key":"2620_CR1","unstructured":"AI@Meta: Llama 3 Model Card. https:\/\/github.com\/meta-llama\/llama3\/blob\/main\/MODEL_CARD.md (2024)"},{"key":"2620_CR2","doi-asserted-by":"publisher","first-page":"23716","DOI":"10.52202\/068431-1723","volume":"35","author":"J-B Alayrac","year":"2022","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., Ring, R., Rutherford, E., Cabi, S., Han, T., Gong, Z., Samangooei, S., Monteiro, M., Menick, J., Borgeaud, S., \u2026 Simonyan, K. (2022). Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems, 35, 23716\u201323736.","journal-title":"Advances in neural information processing systems"},{"key":"2620_CR3","unstructured":"Anthropic: Introducing the next generation of Claude. https:\/\/www.anthropic.com\/news\/claude-3-family (2024)"},{"key":"2620_CR4","unstructured":"Awadalla, A., Gao, I., Gardner, J., Hessel, J., Hanafy, Y., Zhu, W., Marathe, K., Bitton, Y., Gadre, S., Sagawa, S., Jitsev, J., Kornblith, S., Koh, P.W., Ilharco, G., Wortsman, M., & Schmidt, L. (2023). Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390"},{"key":"2620_CR5","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023). 
Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966"},{"key":"2620_CR6","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., & Zisserman, A. (2021) Frozen in time: A joint video and image encoder for end-to-end retrieval. In: IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2620_CR7","unstructured":"Cai, Z., Cao, M., Chen, H., Chen, K., Chen, K., Chen, X., Chen, X., Chen, Z., Chen, Z., Chu, P., Dong, X., Duan, H., Fan, Q., Fei, Z., Gao, Y., Ge, J., Gu, C., Gu, Y., Gui, T., Guo, A., Guo, Q., He, C., Hu, Y., Huang, T., Jiang, T., Jiao, P., Jin, Z., Lei, Z., Li, J., Li, J., Li, L., Li, S., Li, W., Li, Y., Liu, H., Liu, J., Hong, J., Liu, K., Liu, K., Liu, X., Lv, C., Lv, H., Lv, K., Ma, L., Ma, R., Ma, Z., Ning, W., Ouyang, L., Qiu, J., Qu, Y., Shang, F., Shao, Y., Song, D., Song, Z., Sui, Z., Sun, P., Sun, Y., Tang, H., Wang, B., Wang, G., Wang, J., Wang, J., Wang, R., Wang, Y., Wang, Z., Wei, X., Weng, Q., Wu, F., Xiong, Y., Xu, C., Xu, R., Yan, H., Yan, Y., Yang, X., Ye, H., Ying, H., Yu, J., Yu, J., Zang, Y., Zhang, C., Zhang, L., Zhang, P., Zhang, P., Zhang, R., Zhang, S., Zhang, S., Zhang, W., Zhang, W., Zhang, X., Zhang, X., Zhao, H., Zhao, Q., Zhao, X., Zhou, F., Zhou, Z., Zhuo, J., Zou, Y., Qiu, X., Qiao, Y., & Lin, D. (2024). Internlm2 technical report. arXiv preprint arXiv:2403.17297"},{"key":"2620_CR8","unstructured":"Cai, M., Tan, R., Zhang, J., Zou, B., Zhang, K., Yao, F., Zhu, F., Gu, J., Zhong, Y., Shang, Y., Dou, Y., Park, J., Gao, J., Lee, Y.J., & Yang, J. (2024). Temporalbench: Benchmarking fine-grained temporal understanding for multimodal video models. arXiv preprint arXiv:2410.10818"},{"key":"2620_CR9","unstructured":"Castellano, B. PySceneDetect. https:\/\/github.com\/Breakthrough\/PySceneDetect"},{"key":"2620_CR10","doi-asserted-by":"crossref","unstructured":"Chen, T.-S., Siarohin, A., Menapace, W., Deyneka, E., Chao, H.-w., Jeon, B.E., Fang, Y., Lee, H.-Y., Ren, J., Yang, M.-H., & Tulyakov, S. (2024) Panda-70m: Captioning 70m videos with multiple cross-modality teachers. arXiv preprint arXiv:2402.19479","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"2620_CR11","doi-asserted-by":"crossref","unstructured":"Chen, A., Wang, Z., Dong, C., Tian, K., Zhao, R., Liang, X., Kang, Z., & Li, X. (2023). Chinaopen: A dataset for open-world multimodal learning. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 6432\u20136440","DOI":"10.1145\/3581783.3612156"},{"key":"2620_CR12","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wang, W., Tian, H., Ye, S., Gao, Z., Cui, E., Tong, W., Hu, K., Luo, J., Ma, Z., Ma, J., Wang, J., Dong, X., Yan, H., Guo, H., He, C., Shi, B., Jin, Z., Xu, C., Wang, B., Wei, X., Li, W., Zhang, W., Zhang, B., Cai, P., Wen, L., Yan, X., Dou, M., Lu, L., Zhu, X., Lu, T., Lin, D., Qiao, Y., Dai, J., & Wang, W. (2024). How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. arXiv preprint arXiv:2404.16821","DOI":"10.1007\/s11432-024-4231-5"},{"key":"2620_CR13","doi-asserted-by":"crossref","unstructured":"Chen, L., Wei, X., Li, J., Dong, X., Zhang, P., Zang, Y., Chen, Z., Duan, H., Lin, B., Tang, Z., Yuan, L., Qiao, Y., Lin, D., Zhao, F., & Wang, J. (2024). Sharegpt4video: Improving video understanding and generation with better captions. 
arXiv preprint arXiv:2406.04325","DOI":"10.52202\/079017-0614"},{"key":"2620_CR14","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wu, J., Wang, W., Su, W., Chen, G., Xing, S., Zhong, M., Zhang, Q., Zhu, X., Lu, L., Li, B., Luo, P., Lu, T., Qiao, Y., & Dai, J. (2024). Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"2620_CR15","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., & Zhao, R. (2023). Shikra: Unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195"},{"key":"2620_CR16","unstructured":"Chen, J., Zhu, D., Shen, X., Li, X., Liu, Z., Zhang, P., Krishnamoorthi, R., Chandra, V., Xiong, Y., & Elhoseiny, M. (2023) Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478"},{"key":"2620_CR17","unstructured":"Cheng, Z., Leng, S., Zhang, H., Xin, Y., Li, X., Chen, G., Zhu, Y., Zhang, W., Luo, Z., Zhao, D., & Bing, L. (2024). Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms. arXiv preprint arXiv:2406.07476"},{"key":"2620_CR18","doi-asserted-by":"crossref","unstructured":"Chu, X., Su, J., Zhang, B., & Shen, C. (2024). Visionllama: A unified llama backbone for vision tasks. In: European Conference on Computer Vision","DOI":"10.1007\/978-3-031-72848-8_1"},{"key":"2620_CR19","unstructured":"Cui, E., He, Y., Ma, Z., Chen, Z., Tian, H., Wang, W., Li, K., Wang, Y., Wang, W., Zhu, X., Lu, L., Lu, T., Wang, Y., Wang, L., Qiao, Y., & Dai, J. (2024). ShareGPT-4o: Comprehensive Multimodal Annotations With GPT-4o. https:\/\/sharegpt4o.github.io\/"},{"key":"2620_CR20","doi-asserted-by":"crossref","unstructured":"Dai, W., Li, J., Li, D., Tiong, A.M.H., Zhao, J., Wang, W., Li, B., Fung, P., & Hoi, S. (2023). Instructblip: Towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500","DOI":"10.52202\/075280-2142"},{"key":"2620_CR21","unstructured":"Dong, X., Zhang, P., Zang, Y., Cao, Y., Wang, B., Ouyang, L., Wei, X., Zhang, S., Duan, H., Cao, M., Zhang, W., Li, Y., Yan, H., Gao, Y., Zhang, X., Li, W., Li, J., Chen, K., He, C., Zhang, X., Qiao, Y., Lin, D., & Wang, J. (2024) Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420"},{"key":"2620_CR22","doi-asserted-by":"crossref","unstructured":"Du, Y., Chen, Z., Jia, C., Yin, X., Zheng, T., Li, C., Du, Y., & Jiang, Y.-G. (2022). Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159","DOI":"10.24963\/ijcai.2022\/124"},{"key":"2620_CR23","unstructured":"Fang, X., Mao, K., Duan, H., Zhao, X., Li, Y., Lin, D., & Chen, K. (2024). Mmbench-video: A long-form multi-shot benchmark for holistic video understanding. arXiv preprint arXiv:2406.14515"},{"key":"2620_CR24","doi-asserted-by":"crossref","unstructured":"Fang, Y., Wang, W., Xie, B., Sun, Q., Wu, L., Wang, X., Huang, T., Wang, X., & Cao, Y. (2023). Eva: Exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"2620_CR25","unstructured":"Fei, J., Li, D., Deng, Z., Wang, Z., Liu, G., & Wang, H. (2024). 
Video-ccam: Enhancing video-language understanding with causal cross-attention masks for short and long videos. arXiv preprint arXiv:2408.14023"},{"key":"2620_CR26","doi-asserted-by":"crossref","unstructured":"Fu, C., Dai, Y., Luo, Y., Li, L., Ren, S., Zhang, R., Wang, Z., Zhou, C., Shen, Y., Zhang, M., Chen, P., Li, Y., Lin, S., Zhao, S., Li, K., Xu, T., Zheng, X., Chen, E., Shan, C., He, R., & Sun, X. (2024). Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"2620_CR27","unstructured":"Fu, C., Lin, H., Long, Z., Shen, Y., Zhao, M., Zhang, Y., Wang, X., Yin, D., Ma, L., Zheng, X., He, R., Ji, R., Wu, Y., Shan, C., & Sun, X. (2024). Vita: Towards open-source interactive omni multimodal llm. arXiv preprint arXiv:2408.05211"},{"key":"2620_CR28","unstructured":"GLM, T., Zeng, A., Xu, B., Wang, B., Zhang, C., Yin, D., Zhang, D., Rojas, D., Feng, G., Zhao, H., Lai, H., Yu, H., Wang, H., Sun, J., Zhang, J., Cheng, J., Gui, J., Tang, J., Zhang, J., Sun, J., Li, J., Zhao, L., Wu, L., Zhong, L., Liu, M., Huang, M., Zhang, P., Zheng, Q., Lu, R., Duan, S., Zhang, S., Cao, S., Yang, S., Tam, W.L., Zhao, W., Liu, X., Xia, X., Zhang, X., Gu, X., Lv, X., Liu, X., Liu, X., Yang, X., Song, X., Zhang, X., An, Y., Xu, Y., Niu, Y., Yang, Y., Li, Y., Bai, Y., Dong, Y., Qi, Z., Wang, Z., Yang, Z., Du, Z., Hou, Z., & Wang, Z. (2024). Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793"},{"key":"2620_CR29","unstructured":"Google, G.T. (2023). Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805"},{"key":"2620_CR30","doi-asserted-by":"crossref","unstructured":"Goyal, R., Kahou, S.E., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., Hoppe, F., Thurau, C., Bax, I., & Memisevic, R. (2017). The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850","DOI":"10.1109\/ICCV.2017.622"},{"key":"2620_CR31","doi-asserted-by":"crossref","unstructured":"Grauman, K., Westbury, A., Byrne, E., Chavis, Z., Furnari, A., Girdhar, R., Hamburger, J., Jiang, H., Liu, M., Liu, X., Martin, M., Nagarajan, T., Radosavovic, I., Ramakrishnan, S.K., Ryan, F., Sharma, J., Wray, M., Xu, M., Xu, E.Z., Zhao, C., Bansal, S., Batra, D., Cartillier, V., Crane, S., Do, T., Doulaty, M., Erapalli, A., Feichtenhofer, C., Fragomeni, A., Fu, Q., Gebreselasie, A., Gonzalez, C., Hillis, J., Huang, X., Huang, Y., Jia, W., Khoo, W., Kolar, J., Kottur, S., Kumar, A., Landini, F., Li, C., Li, Y., Li, Z., Mangalam, K., Modhugu, R., Munro, J., Murrell, T., Nishiyasu, T., Price, W., Puentes, P.R., Ramazanova, M., Sari, L., Somasundaram, K., Southerland, A., Sugano, Y., Tao, R., Vo, M., Wang, Y., Wu, X., Yagi, T., Zhao, Z., Zhu, Y., Arbelaez, P., Crandall, D., Damen, D., Farinella, G.M., Fuegen, C., Ghanem, B., Ithapu, V.K., Jawahar, C.V., Joo, H., Kitani, K., Li, H., Newcombe, R., Oliva, A., Park, H.S., Rehg, J.M., Sato, Y., Shi, J., Shou, M.Z., Torralba, A., Torresani, L., Yan, M., & Malik, J. (2022). Ego4d: Around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
18995\u201319012","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"2620_CR32","doi-asserted-by":"publisher","first-page":"26418","DOI":"10.52202\/068431-1916","volume":"35","author":"J Gu","year":"2022","unstructured":"Gu, J., Meng, X., Lu, G., Hou, L., Niu, M., Liang, X., Yao, L., Huang, R., Zhang, W., Jiang, X., Xu, C., & Xu, H. (2022). Wukong: A 100 million large-scale chinese cross-modal pre-training benchmark. Advances in Neural Information Processing Systems, 35, 26418\u201326431.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"1\u20133","key":"2620_CR33","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1016\/0004-3702(81)90024-2","volume":"17","author":"BK Horn","year":"1981","unstructured":"Horn, B. K., & Schunck, B. G. (1981). Determining optical flow. Artificial intelligence, 17(1\u20133), 185\u2013203.","journal-title":"Artificial intelligence"},{"key":"2620_CR34","unstructured":"Huang, S., Dong, L., Wang, W., Hao, Y., Singhal, S., Ma, S., Lv, T., Cui, L., Mohammed, O.K., Patra, B., Liu, Q., Aggarwal, K., Chi, Z., Bjorck, J., Chaudhary, V., Som, S., Song, X., & Wei, F. (2024). Language is not all you need: Aligning perception with language models. Advances in Neural Information Processing Systems 36"},{"key":"2620_CR35","doi-asserted-by":"crossref","unstructured":"Jang, Y., Song, Y., Yu, Y., Kim, Y., & Kim, G. (2017). Tgif-qa: Toward spatio-temporal reasoning in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2758\u20132766","DOI":"10.1109\/CVPR.2017.149"},{"key":"2620_CR36","unstructured":"Jiang, D., He, X., Zeng, H., Wei, C., Ku, M., Liu, Q., & Chen, W. (2024). Mantis: Interleaved multi-image instruction tuning. arXiv preprint arXiv:2405.01483"},{"key":"2620_CR37","doi-asserted-by":"crossref","unstructured":"Jin, P., Takanobu, R., Zhang, W., Cao, X., & Yuan, L. (2024). Chat-univi: Unified visual representation empowers large language models with image and video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13700\u201313710","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"2620_CR38","unstructured":"Jocher, G., Chaurasia, A., & Qiu, J. (2023) Ultralytics YOLO. https:\/\/github.com\/ultralytics\/ultralytics"},{"key":"2620_CR39","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Bansal, M., & Berg, T.L. (2018). Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696","DOI":"10.18653\/v1\/D18-1167"},{"key":"2620_CR40","unstructured":"Li, Y., Chen, X., Hu, B., Wang, L., Shi, H., & Zhang, M. (2024). Videovista: A versatile benchmark for video understanding and reasoning. arXiv preprint arXiv:2406.11303"},{"key":"2620_CR41","doi-asserted-by":"crossref","unstructured":"Li, B., Ge, Y., Ge, Y., Wang, G., Wang, R., Zhang, R., & Shan, Y. (2024). Seed-bench: Benchmarking multimodal large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13299\u201313308","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"2620_CR42","unstructured":"Li, K., He, Y., Wang, Y., Li, Y., Wang, W., Luo, P., Wang, Y., Wang, L., & Qiao, Y. (2023). Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355"},{"key":"2620_CR43","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. 
In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR"},{"key":"2620_CR44","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., & Jia, J. (2024). Llama-vid: An image is worth 2 tokens in large language models. In: European Conference on Computer Vision","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"2620_CR45","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Liu, Y., Wang, Z., Xu, J., Chen, G., Luo, P., Wang, L., & Qiao, Y. (2024). Mvbench: A comprehensive multi-modal video understanding benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22195\u201322206","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"2620_CR46","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Wang, L., & Qiao, Y. (2022) Uniformerv2: Spatiotemporal learning by arming image vits with video uniformer. arXiv preprint arXiv:2211.09552"},{"key":"2620_CR47","unstructured":"Li, F., Zhang, R., Zhang, H., Zhang, Y., Li, B., Li, W., Ma, Z., & Li, C. (2024). Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models. arXiv preprint arXiv:2407.07895"},{"key":"2620_CR48","doi-asserted-by":"crossref","unstructured":"Lin, J., Yin, H., Ping, W., Molchanov, P., Shoeybi, M., & Han, S. (2024). Vila: On pre-training for visual language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26689\u201326699","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"2620_CR49","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., & Yuan, L. (2023). Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2620_CR50","doi-asserted-by":"crossref","unstructured":"Liu, H., & Abbeel, P. (2023). Blockwise parallel transformer for large context models. Advances in neural information processing systems","DOI":"10.52202\/075280-0386"},{"key":"2620_CR51","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y.J. (2024). Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2620_CR52","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Liu, Y., Wang, Y., Ren, S., Li, L., Chen, S., Sun, X., & Hou, L. (2024). Tempcompass: Do video llms really understand videos? arXiv preprint arXiv:2403.00476","DOI":"10.18653\/v1\/2024.findings-acl.517"},{"key":"2620_CR53","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Advances in neural information processing systems 36 (2024)"},{"key":"2620_CR54","unstructured":"Liu, H., Yan, W., Zaharia, M., & Abbeel, P. (2024). World model on million-length video and language with ringattention. arXiv preprint arXiv:2402.08268"},{"key":"2620_CR55","unstructured":"Liu, D., Zhang, R., Qiu, L., Huang, S., Lin, W., Zhao, S., Geng, S., Lin, Z., Jin, P., Zhang, K., Shao, W., Xu, C., He, C., He, J., Shao, H., Lu, P., Li, H., Qiao, Y., & Gao, P. (2024). Sphinx-x: Scaling data and parameters for a family of multi-modal large language models. arXiv preprint arXiv:2402.05935"},{"key":"2620_CR56","unstructured":"Luo, R., Zhao, Z., Yang, M., Dong, J., Li, D., Lu, P., Wang, T., Hu, L., Qiu, M., & Wei, Z. (2023). Valley: Video assistant with large language model enhanced ability. 
arXiv preprint arXiv:2306.07207"},{"key":"2620_CR57","unstructured":"Lv, T., Huang, Y., Chen, J., Zhao, Y., Jia, Y., Cui, L., Ma, S., Chang, Y., Huang, S., Wang, W., Dong, L., Luo, W., Wu, S., Wang, G., Zhang, C., & Wei, F. (2023). Kosmos-2.5: A multimodal literate model. arXiv preprint arXiv:2309.11419"},{"key":"2620_CR58","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F.S. (2024) Video-chatgpt: Towards detailed video understanding via large vision and language models. Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2620_CR59","unstructured":"Mahdisoltani, F., Berger, G., Gharbieh, W., Fleet, D., & Memisevic, R. (2018) On the effectiveness of task granularity for transfer learning. arXiv preprint arXiv:1804.09235"},{"key":"2620_CR60","doi-asserted-by":"crossref","unstructured":"Mangalam, K., Akshulakov, R., & Malik, J. (2024). Egoschema: A diagnostic benchmark for very long-form video language understanding. Advances in Neural Information Processing Systems 36","DOI":"10.52202\/075280-2004"},{"key":"2620_CR61","unstructured":"OpenAI: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"2620_CR62","unstructured":"OpenAI: Gpt-4v(ision) system card. https:\/\/cdn.openai.com\/papers\/GPTV_System_Card.pdf (2023)"},{"key":"2620_CR63","unstructured":"OpenAI: Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/ (2024)"},{"key":"2620_CR64","unstructured":"OpenAI: Introducing ChatGPT. https:\/\/openai.com\/chatgpt\/ (2023)"},{"key":"2620_CR65","unstructured":"Peng, Z., Wang, W., Dong, L., Hao, Y., Huang, S., Ma, S., Ye, Q., & Wei, F. (2024). Grounding multimodal large language models to the world. In: The Twelfth International Conference on Learning Representations"},{"key":"2620_CR66","unstructured":"Rawal, R., Saifullah, K., Basri, R., Jacobs, D., Somepalli, G., & Goldstein, T. (2024). Cinepile: A long video question answering dataset and benchmark. arXiv preprint arXiv:2405.08813"},{"key":"2620_CR67","doi-asserted-by":"crossref","unstructured":"Ren, S., Yao, L., Li, S., Sun, X. & Hou, L. (2024) Timechat: A time-sensitive multimodal large language model for long video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14313\u201314323","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"2620_CR68","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., Schramowski, P., Kundurthy, S., Crowson, K., Schmidt, L., Kaczmarczyk, R., & Jitsev, J. (2022). Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, 35, 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2620_CR69","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., & Gupta, A. (2016). Hollywood in homes: Crowdsourcing data collection for activity understanding. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14, pp. 510\u2013526. 
Springer","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"2620_CR70","doi-asserted-by":"crossref","unstructured":"Song, E., Chai, W., Wang, G., Zhang, Y., Zhou, H., Wu, F., Chi, H., Guo, X., Ye, T., Zhang, Y., Lu, Y., Hwang, J.-N., & Wang, G. (2024) Moviechat: From dense token to sparse memory for long video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18221\u201318232","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"2620_CR71","unstructured":"Sun, A. jieba. https:\/\/github.com\/fxsjy\/jieba"},{"key":"2620_CR72","unstructured":"Vaswani, A. (2017) Attention is all you need. Advances in Neural Information Processing Systems"},{"key":"2620_CR73","unstructured":"Wang, W., He, Z., Hong, W., Cheng, Y., Zhang, X., Qi, J., Gu, X., Huang, S., Xu, B., Dong, Y., Ding, M., & Tang, J. (2024). Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:2406.08035"},{"key":"2620_CR74","unstructured":"Wang, W., Lv, Q., Yu, W., Hong, W., Qi, J., Wang, Y., Ji, J., Yang, Z., Zhao, L., Song, X., Xu, J., Xu, B., Li, J., Dong, Y., Ding, M., & Tang, J. (2023). Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079"},{"key":"2620_CR75","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.-F., & Wang, W.Y. (2019). Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4581\u20134591","DOI":"10.1109\/ICCV.2019.00468"},{"key":"2620_CR76","unstructured":"Wu, H., Li, D., Chen, B., & Li, J. (2024). Longvideobench: A benchmark for long-context interleaved video-language understanding. arXiv preprint arXiv:2407.15754"},{"key":"2620_CR77","doi-asserted-by":"crossref","unstructured":"Wu, W., Zhao, Y., Li, Z., Li, J., Zhou, H., Shou, M.Z., & Bai, X. (2024). A large cross-modal video retrieval dataset with reading comprehension. Pattern Recognition, 110818","DOI":"10.1016\/j.patcog.2024.110818"},{"key":"2620_CR78","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., & Chua, T.-S. (2021). Next-qa: Next phase of question-answering to explaining temporal actions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9777\u20139786","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"2620_CR79","unstructured":"Xu, H., Ye, Q., Wu, X., Yan, M., Miao, Y., Ye, J., Xu, G., Hu, A., Shi, Y., Xu, G., Li, C., Qian, Q., Que, M., Zhang, J., Zeng, X., & Huang, F. (2023). Youku-mplug: A 10 million large-scale chinese video-language dataset for pre-training and benchmarks. arXiv preprint arXiv:2306.04362"},{"key":"2620_CR80","unstructured":"Xu, L., Zhao, Y., Zhou, D., Lin, Z., Ng, S.K., & Feng, J. (2024). Pllava: Parameter-free llava extension from images to videos for video dense captioning. arXiv preprint arXiv:2404.16994"},{"key":"2620_CR81","doi-asserted-by":"crossref","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., & Schmid, C. (2021). Just ask: Learning to answer questions from millions of narrated videos. 
In: ICCV","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"2620_CR82","unstructured":"Yang, A., Xiao, B., Wang, B., Zhang, B., Bian, C., Yin, C., Lv, C., Pan, D., Wang, D., Yan, D., Yang, F., Deng, F., Wang, F., Liu, F., Ai, G., Dong, G., Zhao, H., Xu, H., Sun, H., Zhang, H., Liu, H., Ji, J., Xie, J., Dai, J., Fang, K., Su, L., Song, L., Liu, L., Ru, L., Ma, L., Wang, M., Liu, M., Lin, M., Nie, N., Guo, P., Sun, R., Zhang, T., Li, T., Li, T., Cheng, W., Chen, W., Zeng, X., Wang, X., Chen, X., Men, X., Yu, X., Pan, X., Shen, Y., Wang, Y., Li, Y., Jiang, Y., Gao, Y., Zhang, Y., Zhou, Z., & Wu, Z. (2023). Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305"},{"key":"2620_CR83","unstructured":"Ye, J., Hu, A., Xu, H., Ye, Q., Yan, M., Dan, Y., Zhao, C., Xu, G., Li, C., Tian, J., Qi, Q., Zhang, J., & Huang, F. (2023). mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499"},{"key":"2620_CR84","unstructured":"Yi, K., Gan, C., Li, Y., Kohli, P., Wu, J., Torralba, A., & Tenenbaum, J.B. (2019). Clevrer: Collision events for video representation and reasoning. arXiv preprint arXiv:1910.01442"},{"key":"2620_CR85","unstructured":"Zhang, P., Dong, X., Wang, B., Cao, Y., Xu, C., Ouyang, L., Zhao, Z., Duan, H., Zhang, S., Ding, S., Zhang, W., Yan, H., Zhang, X., Li, W., Li, J., Chen, K., He, C., Zhang, X., Qiao, Y., Lin, D., & Wang, J. (2023). Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112"},{"key":"2620_CR86","unstructured":"Zhang, P., Dong, X., Zang, Y., Cao, Y., Qian, R., Chen, L., Guo, Q., Duan, H., Wang, B., Ouyang, L., Zhang, S., Zhang, W., Li, Y., Gao, Y., Sun, P., Zhang, X., Li, W., Li, J., Wang, W., Yan, H., He, C., Zhang, X., Chen, K., Dai, J., Qiao, Y., Lin, D., & Wang, J. (2024). Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual input and output. arXiv preprint arXiv:2407.03320"},{"key":"2620_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, R., Gui, L., Sun, Z., Feng, Y., Xu, K., Zhang, Y., Fu, D., Li, C., Hauptmann, A., Bisk, Y., & Yang, Y. (2024). Direct preference optimization of video large multimodal models from language model reward. arXiv preprint arXiv:2404.01258","DOI":"10.18653\/v1\/2025.naacl-long.30"},{"key":"2620_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., & Bing, L. (2023) Video-llama: An instruction-tuned audio-visual language model for video understanding. EMNLP Demo Track","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"2620_CR89","unstructured":"Zhang, Y., Li, B., Liu, H., Lee, Y.J., Gui, L., Fu, D., Feng, J., Liu, Z., & Li, C. (2024). LLaVA-NeXT: A Strong Zero-shot Video Understanding Model. https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"},{"key":"2620_CR90","unstructured":"Zhang, Y.-F., Wen, Q., Fu, C., Wang, X., Zhang, Z., Wang, L., & Jin, R. (2024). Beyond llava-hd: Diving into high-resolution large multimodal models. arXiv preprint arXiv:2406.08487"},{"key":"2620_CR91","unstructured":"Zhang, Y., Zhang, R., Gu, J., Zhou, Y., Lipka, N., Yang, D., & Sun, T. (2023). Llavar: Enhanced visual instruction tuning for text-rich image understanding. arXiv preprint arXiv:2306.17107"},{"key":"2620_CR92","unstructured":"Zhang, P., Zhang, K., Li, B., Zeng, G., Yang, J., Zhang, Y., Wang, Z., Tan, H., Li, C., & Liu, Z. (2024). Long context transfer from language to vision. 
arXiv preprint arXiv:2406.16852"},{"key":"2620_CR93","doi-asserted-by":"crossref","unstructured":"Zhou, J., Shu, Y., Zhao, B., Wu, B., Xiao, S., Yang, X., Xiong, Y., Zhang, B., Huang, T., & Liu, Z. (2024). Mlvu: A comprehensive benchmark for multi-task long video understanding. arXiv preprint arXiv:2406.04264","DOI":"10.1109\/CVPR52734.2025.01278"},{"key":"2620_CR94","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., & Corso, J. (2018). Towards automatic learning of procedures from web instructional videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"2620_CR95","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2024). Minigpt-4: Enhancing vision-language understanding with advanced large language models. In: International Conference on Learning Representations"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02620-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02620-2","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02620-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:34:46Z","timestamp":1774600486000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02620-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,9]]},"references-count":95,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2620"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02620-2","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,9]]},"assertion":[{"value":"19 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}}],"article-number":"114"}}
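
The record above is a standard Crossref REST API work message. As a minimal sketch (not part of the deposited metadata), the same payload can be fetched from the public api.crossref.org works endpoint and its fields read as shown below; the third-party requests package is an assumption, and the field names used are exactly those that appear in the JSON above.

import requests

# Sketch: retrieve this work's Crossref record and inspect a few fields of
# the "message" object, which mirrors the JSON document above.
DOI = "10.1007/s11263-025-02620-2"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]

print(work["title"][0])            # article title ("Kangaroo: ...")
print(work["container-title"][0])  # "International Journal of Computer Vision"
print(work["references-count"])    # 95
for ref in work["reference"][:3]:  # first entries of the 95-item reference list
    print(ref["key"], ref.get("unstructured", "")[:60])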