{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T13:04:44Z","timestamp":1775567084866,"version":"3.50.1"},"reference-count":97,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T00:00:00Z","timestamp":1739836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T00:00:00Z","timestamp":1739836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","award":["NRF-NRFFAI1-2019-0001"],"award-info":[{"award-number":["NRF-NRFFAI1-2019-0001"]}],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02385-8","type":"journal-article","created":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T19:36:27Z","timestamp":1739907387000},"page":"3970-3993","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["VideoQA in the Era of LLMs: An Empirical Study"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5573-6195","authenticated-orcid":false,"given":"Junbin","family":"Xiao","sequence":"first","affiliation":[]},{"given":"Nanxin","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Hangyu","family":"Qin","sequence":"additional","affiliation":[]},{"given":"Dongyang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yicong","family":"Li","sequence":"additional","affiliation":[]},{"given":"Fengbin","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Zhulin","family":"Tao","sequence":"additional","affiliation":[]},{"given":"Jianxing","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Liang","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[]},{"given":"Angela","family":"Yao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,18]]},"reference":[{"key":"2385_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Batra, D., & Parikh, D. (2016). Analyzing the behavior of visual question answering models. In: Conference on empirical methods in natural language processing (EMNLP), pp. 1955\u20131960.","DOI":"10.18653\/v1\/D16-1203"},{"key":"2385_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J. B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et al. (2022). Flamingo: A visual language model for few-shot learning. Advances in Neural Information Processing Systems (NeurIPS), 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"2385_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., & Parikh, D. (2015). Vqa: Visual question answering. 
In: Proceedings of the IEEE international conference on computer vision (ICCV), pp. 2425\u20132433.","DOI":"10.1109\/ICCV.2015.279"},{"key":"2385_CR4","doi-asserted-by":"crossref","unstructured":"Bagad, P., Tapaswi, M., & Snoek, C.G. (2023). Test of time: Instilling video-language models with a sense of time. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 2503\u20132516.","DOI":"10.1109\/CVPR52729.2023.00247"},{"key":"2385_CR5","unstructured":"Bai, Z., Wang, P., Xiao, T., He, T., Han, Z., Zhang, Z., & Shou, M.Z. (2024). Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930."},{"key":"2385_CR6","doi-asserted-by":"crossref","unstructured":"Buch, S., Eyzaguirre, C., Gaidon, A., Wu, J., Fei-Fei, L., & Niebles, J.C. (2022). Revisiting the \"video\" in video-language understanding. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 2917\u20132927.","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"2385_CR7","doi-asserted-by":"crossref","unstructured":"Chen, X., Djolonga, J., Padlewski, P., Mustafa, B., Changpinyo, S., Wu, J., Ruiz, C.R., Goodman, S., Wang, X., Tay, Y., et\u00a0al. (2023). Pali-x: On scaling up a multilingual vision and language model. arXiv preprint arXiv:2305.18565.","DOI":"10.1109\/CVPR52733.2024.01368"},{"key":"2385_CR8","unstructured":"Chiang, W.L., Li, Z., Lin, Z., Sheng, Y., Wu, Z., Zhang, H., Zheng, L., Zhuang, S., Zhuang, Y., Gonzalez, J.E., Stoica, I., & Xing, E.P. (2023). Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/."},{"key":"2385_CR9","unstructured":"Chung, H.W., Hou, L., Longpre, S., Zoph, B., Tay, Y., Fedus, W., Li, Y., Wang, X., Dehghani, M., Brahma, S., et\u00a0al. (2022). Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416."},{"key":"2385_CR10","unstructured":"Dai, W., Li, J., Li, D., Tiong, A.M.H., Zhao, J., Wang, W., Li, B., Fung, P., & Hoi, S. (2023). Instructblip: Towards general-purpose vision-language models with instruction tuning. In: Proceedings of the 37th international conference on neural information processing systems (NeurIPS), pp. 49250\u201349267."},{"key":"2385_CR11","doi-asserted-by":"crossref","unstructured":"Datta, S., Dharur, S., Cartillier, V., Desai, R., Khanna, M., Batra, D., & Parikh, D. (2022). Episodic memory question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 19119\u201319128.","DOI":"10.1109\/CVPR52688.2022.01853"},{"key":"2385_CR12","unstructured":"Devlin, J., Chang, M.W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"2385_CR13","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S., et\u00a0al. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In: International conference on learning representations (ICLR)."},{"key":"2385_CR14","unstructured":"Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Yang, A., Fan, A., et\u00a0al. (2024). The llama 3 herd of models. arXiv preprint arXiv:2407.21783."},{"key":"2385_CR15","doi-asserted-by":"crossref","unstructured":"Fan, Y., Ma, X., Wu, R., Du, Y., Li, J., Gao, Z., & Li, Q. (2024). 
Videoagent: A memory-augmented multimodal agent for video understanding. In: European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-031-72670-5_5"},{"key":"2385_CR16","unstructured":"Fei, H., Wu, S., Ji, W., Zhang, H., Zhang, M., Lee, M.L., & Hsu, W. (2024). Video-of-thought: Step-by-step video reasoning from perception to cognition. In: Forty-first international conference on machine learning (ICML)."},{"key":"2385_CR17","unstructured":"Fu, C., Dai, Y., Luo, Y., Li, L., Ren, S., Zhang, R., Wang, Z., Zhou, C., Shen, Y., Zhang, M., et\u00a0al. (2024). Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075."},{"key":"2385_CR18","unstructured":"Fu, T.J., Li, L., Gan, Z., Lin, K., Wang, W.Y., Wang, L., & Liu, Z. (2021). Violet: End-to-end video-language transformers with masked visual-token modeling. arXiv preprint arXiv:2111.12681."},{"key":"2385_CR19","doi-asserted-by":"crossref","unstructured":"Fu, T.J., Li, L., Gan, Z., Lin, K., Wang, W.Y., Wang, L., & Liu, Z. (2023). An empirical study of end-to-end video-language transformers with masked visual modeling. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 22898\u201322909.","DOI":"10.1109\/CVPR52729.2023.02193"},{"key":"2385_CR20","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., & Parikh, D. (2017). Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: IEEE conference on computer vision and pattern recognition (CVPR), pp. 6904\u20136913.","DOI":"10.1109\/CVPR.2017.670"},{"key":"2385_CR21","unstructured":"Grauman, K., Westbury, A., Byrne, E., Chavis, Z., Furnari, A., Girdhar, R., Hamburger, J., Jiang, H., Liu, M., Liu, X., et\u00a0al. (2022). Ego4d: Around the world in 3000 hours of egocentric video. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 18995\u201319012."},{"key":"2385_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: Proceedings of the ieee conference on computer vision and pattern recognition (CVPR), pp. 770\u2013778 .","DOI":"10.1109\/CVPR.2016.90"},{"key":"2385_CR23","unstructured":"He, P., Liu, X., Gao, J., & Chen, W. (2020). Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654."},{"key":"2385_CR24","doi-asserted-by":"crossref","unstructured":"Himakunthala, V., Ouyang, A., Rose, D., He, R., Mei, A., Lu, Y., Sonar, C., Saxon, M., & Wang, W. (2023). Let\u2019s think frame by frame with vip: A video infilling and prediction dataset for evaluating video chain-of-thought. In: Proceedings of the 2023 conference on empirical methods in natural language processing (EMNLP), pp. 204\u2013219.","DOI":"10.18653\/v1\/2023.emnlp-main.15"},{"issue":"8","key":"2385_CR25","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735\u20131780.","journal-title":"Neural computation"},{"key":"2385_CR26","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021). Lora: Low-rank adaptation of large language models. 
arXiv preprint arXiv:2106.09685."},{"key":"2385_CR27","unstructured":"Ilharco, G., Wortsman, M., Wightman, R., Gordon, C., Carlini, N., Taori, R., Dave, A., Shankar, V., Namkoong, H., Miller, J., Hajishirzi, H., Farhadi, A., & Schmidt, L. (2021). Openclip. DOI 10.5281\/zenodo.5143773, URL https:\/\/doi.org\/10.5281\/zenodo.5143773."},{"key":"2385_CR28","doi-asserted-by":"crossref","unstructured":"Jang, Y., Song, Y., Yu, Y., Kim, Y., & Kim, G. (2017). Tgif-qa: Toward spatio-temporal reasoning in visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp. 2758\u20132766.","DOI":"10.1109\/CVPR.2017.149"},{"key":"2385_CR29","doi-asserted-by":"publisher","first-page":"1385","DOI":"10.1007\/s11263-019-01189-x","volume":"127","author":"Y Jang","year":"2019","unstructured":"Jang, Y., Song, Y., Kim, C. D., Yu, Y., Kim, Y., & Kim, G. (2019). Video question answering with spatio-temporal reasoning. International Journal of Computer Vision (IJCV), 127, 1385\u20131412.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"2385_CR30","doi-asserted-by":"crossref","unstructured":"Kervadec, C., Antipov, G., Baccouche, M., & Wolf, C. (2021). Roses are red, violets are blue... but should vqa expect them to? In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 2776\u20132785.","DOI":"10.1109\/CVPR46437.2021.00280"},{"key":"2385_CR31","doi-asserted-by":"crossref","unstructured":"Kim, W., Choi, C., Lee, W., & Rhee, W. (2024). An image grid can be worth a video: Zero-shot video question answering using a vlm. arXiv preprint arXiv:2403.18406.","DOI":"10.1109\/ACCESS.2024.3517625"},{"key":"2385_CR32","doi-asserted-by":"crossref","unstructured":"Ko, D., Lee, J., Kang, W.Y., Roh, B., & Kim, H. (2023). Large language models are temporal and causal reasoners for video question answering. In: Proceedings of the 2023 conference on empirical methods in natural language processing (EMNLP), pp. 4300\u20134316.","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"2385_CR33","first-page":"22199","volume":"35","author":"T Kojima","year":"2022","unstructured":"Kojima, T., Gu, S. S., Reid, M., Matsuo, Y., & Iwasawa, Y. (2022). Large language models are zero-shot reasoners. Advances in neural information processing systems (NeurIPS), 35, 22199\u201322213.","journal-title":"Advances in neural information processing systems (NeurIPS)"},{"key":"2385_CR34","doi-asserted-by":"crossref","unstructured":"Le, T.M., Le, V., Venkatesh, S., & Tran, T. (2020). Hierarchical conditional relation networks for video question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 9972\u20139981.","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"2385_CR35","doi-asserted-by":"crossref","unstructured":"Lei, J., Li, L., Zhou, L., Gan, Z., Berg, T.L., Bansal, M., & Liu, J. (2021). Less is more: Clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 7331\u20137341.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"2385_CR36","doi-asserted-by":"crossref","unstructured":"Lei, J., Berg, T., & Bansal, M. (2023). Revealing single frame bias for video-and-language learning. In: Proceedings of the 61st annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp. 
487\u2013507.","DOI":"10.18653\/v1\/2023.acl-long.29"},{"key":"2385_CR37","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023a). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International conference on machine learning (ICML), PMLR, pp. 19730\u201319742."},{"key":"2385_CR38","unstructured":"Li, K., He, Y., Wang, Y., Li, Y., Wang, W., Luo, P., Wang, Y., Wang, L., & Qiao, Y. (2023b). Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355."},{"key":"2385_CR39","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., Li, Y., Wang, Y., He, Y., Wang, L., & Qiao, Y. (2023c). Unmasked teacher: Towards training-efficient video foundation models. In: Proceedings of the IEEE\/CVF international conference on computer vision (ICCV).","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"2385_CR40","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Liu, Y., Wang, Z., Xu, J., Chen, G., Luo, P., et\u00a0al. (2024a). Mvbench: A comprehensive multi-modal video understanding benchmark. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 22195\u201322206.","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"2385_CR41","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., & Liu, J. (2020). Hero: Hierarchical encoder for video+ language omni-representation pre-training. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP), pp. 2046\u20132065.","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"2385_CR42","doi-asserted-by":"crossref","unstructured":"Li, S., Li, L., Ren, S., Liu, Y., Liu, Y., Gao, R., Sun, X., & Hou, L. (2023d). Vitatecs: A diagnostic dataset for temporal concept understanding of video-language models. arXiv preprint arXiv:2311.17404.","DOI":"10.1007\/978-3-031-72897-6_19"},{"key":"2385_CR43","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, X., Xiao, J., Ji, W., & Chua, T.S. (2022). Invariant grounding for video question answering. In: IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 2928\u20132937.","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"2385_CR44","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, X., Xiao, J., Ji, W., & Chua, T.S. (2023e). Transformer-empowered invariant grounding for video question answering. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2023.3303451"},{"key":"2385_CR45","doi-asserted-by":"crossref","unstructured":"Li, Y., Xiao, J., Feng, C., Wang, X., & Chua, T.S. (2023f). Discovering spatio-temporal rationales for video question answering. In: IEEE\/CVF international conference on computer vision (ICCV), pp. 13869\u201313878.","DOI":"10.1109\/ICCV51070.2023.01275"},{"key":"2385_CR46","unstructured":"Li, Y., Chen, X., Hu, B., Wang, L., Shi, H., & Zhang, M. (2024b). Videovista: A versatile benchmark for video understanding and reasoning. arXiv preprint arXiv:2406.11303."},{"key":"2385_CR47","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., & Jia, J. (2024c). Llama-vid: An image is worth 2 tokens in large language models. In: European conference on computer vision (ECCV), Springer, pp. 323\u2013340.","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"2385_CR48","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., & Yuan, L. (2023). 
Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122.","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2385_CR49","unstructured":"Liu, B., Dong, Y., Wang, Y., Rao, Y., Tang, Y., Ma, W.C., & Krishna, R. (2024a). Coarse correspondence elicit 3d spacetime understanding in multimodal language model. arXiv preprint arXiv:2408.00754."},{"key":"2385_CR50","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2023). Visual instruction tuning. Advances in neural information processing systems (NeurIPS) 36."},{"key":"2385_CR51","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692."},{"key":"2385_CR52","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Liu, Y., Wang, Y., Ren, S., Li, L., Chen, S., Sun, X., & Hou, L. (2024b). Tempcompass: Do video llms really understand videos? arXiv preprint arXiv:2403.00476.","DOI":"10.18653\/v1\/2024.findings-acl.517"},{"key":"2385_CR53","doi-asserted-by":"crossref","unstructured":"Liu, Z., Ning, J., Cao, Y., Wei, Y., Zhang, Z., Lin, S., & Hu, H. (2022). Video swin transformer. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 3202\u20133211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"2385_CR54","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F.S. (2023). Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424.","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2385_CR55","doi-asserted-by":"crossref","unstructured":"Majumdar, A., Ajay, A., Zhang, X., Putta, P., Yenamandra, S., Henaff, M., Silwal, S., Mcvay, P., Maksymets, O., Arnaud, S., et\u00a0al. (2024). Openeqa: Embodied question answering in the era of foundation models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 16488\u201316498.","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"2385_CR56","unstructured":"Mangalam, K., Akshulakov, R., & Malik, J. (2023). Egoschema: A diagnostic benchmark for very long-form video language understanding. In: The 37th conference on neural information processing systems (NeurIPS) track on datasets and benchmarks."},{"key":"2385_CR57","doi-asserted-by":"crossref","unstructured":"Min, J., Buch, S., Nagrani, A., Cho, M., & Schmid, C. (2024). Morevqa: Exploring modular reasoning models for video question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 13235\u201313245.","DOI":"10.1109\/CVPR52733.2024.01257"},{"key":"2385_CR58","doi-asserted-by":"crossref","unstructured":"Niu, Y., Tang, K., Zhang, H., Lu, Z., Hua, X.S., & Wen, J.R. (2021). Counterfactual vqa: A cause-effect look at language bias. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 12700\u201312710.","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"2385_CR59","unstructured":"OpenAI. (2023). Gpt-4 technical report. arXiv:2303.08774."},{"key":"2385_CR60","unstructured":"P\u0103tr\u0103ucean, V., Smaira, L., Gupta, A., Continente, A.R., Markeeva, L., Banarse, D., Koppula, S., Heyward, J., Malinowski, M., Yang, Y., et\u00a0al. (2023). Perception test: A diagnostic benchmark for multimodal video models. 
In: The 37th conference on neural information processing systems (NeurIPS) track on datasets and benchmarks."},{"key":"2385_CR61","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In: International conference on machine learning (ICML), PMLR, pp. 8748\u20138763."},{"key":"2385_CR62","unstructured":"Seo, P.H., Nagrani, A., & Schmid, C. (2021). Look before you speak: Visually contextualized utterances. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 16877\u201316887."},{"key":"2385_CR63","doi-asserted-by":"crossref","unstructured":"Shah, M., Chen, X., Rohrbach, M., & Parikh, D. (2019). Cycle-consistency for robust visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6649\u20136658.","DOI":"10.1109\/CVPR.2019.00681"},{"key":"2385_CR64","doi-asserted-by":"crossref","unstructured":"Shang, C., You, A., Subramanian, S., Darrell, T., & Herzig, R. (2024). Traveler: A multi-lmm agent framework for video question-answering. arXiv preprint arXiv:2404.01476.","DOI":"10.18653\/v1\/2024.emnlp-main.544"},{"key":"2385_CR65","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., & Schmid, C. (2019). Videobert: A joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF international conference on computer vision (ICCV), pp. 7464\u20137473.","DOI":"10.1109\/ICCV.2019.00756"},{"key":"2385_CR66","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., & Cao, Y. (2023). Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389."},{"key":"2385_CR67","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Menon, S., & Vondrick, C. (2023). Vipergpt: Visual inference via python execution for reasoning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 11888\u201311898.","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"2385_CR68","unstructured":"Tang, Y., Bi, J., Xu, S., Song, L., Liang, S., Wang, T., Zhang, D., An, J., Lin, J., Zhu, R., et\u00a0al. (2023). Video understanding with large language models: A survey. arXiv preprint arXiv:2312.17432."},{"key":"2385_CR69","unstructured":"Team, G., Anil, R., Borgeaud, S., Wu, Y., Alayrac, J.B., Yu, J., Soricut, R., Schalkwyk, J., Dai, A.M., Hauth, A., et\u00a0al. (2023). Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805."},{"key":"2385_CR70","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971."},{"key":"2385_CR71","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, Y., Zohar, O., & Yeung-Levy, S. (2024a). Videoagent: Long-form video understanding with large language model as agent. In: European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-031-72989-8_4"},{"key":"2385_CR72","unstructured":"Wang, Z., Yu, S., Stengel-Eskin, E., Yoon, J., Cheng, F., Bertasius, G., & Bansal, M. (2024b). Videotree: Adaptive tree-based video representation for llm reasoning on long videos. 
arXiv preprint arXiv:2405.19209."},{"key":"2385_CR73","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., Zhou, D., et al. (2022). Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems (NeurIPS), 35, 24824\u201324837.","journal-title":"Advances in neural information processing systems (NeurIPS)"},{"key":"2385_CR74","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., & Chua, T.S. (2021). Next-qa: Next phase of question-answering to explaining temporal actions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 9777\u20139786.","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"2385_CR75","doi-asserted-by":"publisher","first-page":"2804","DOI":"10.1609\/aaai.v36i3.20184","volume":"36","author":"J Xiao","year":"2022","unstructured":"Xiao, J., Yao, A., Liu, Z., Li, Y., Ji, W., & Chua, T. S. (2022). Video as conditional graph hierarchy for multi-granular question answering. Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 36, 2804\u20132812.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)"},{"key":"2385_CR76","doi-asserted-by":"crossref","unstructured":"Xiao, J., Zhou, P., Chua, T.S., & Yan, S. (2022b). Video graph transformer for video question answering. In: European conference on computer vision (ECCV), Springer, pp. 39\u201358.","DOI":"10.1007\/978-3-031-20059-5_3"},{"issue":"11","key":"2385_CR77","doi-asserted-by":"publisher","first-page":"13265","DOI":"10.1109\/TPAMI.2023.3292266","volume":"45","author":"J Xiao","year":"2023","unstructured":"Xiao, J., Zhou, P., Yao, A., Li, Y., Hong, R., Yan, S., & Chua, T. S. (2023). Contrastive video question answering via video graph transformer. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 45(11), 13265\u201313280. https:\/\/doi.org\/10.1109\/TPAMI.2023.3292266","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"2385_CR78","doi-asserted-by":"crossref","unstructured":"Xiao, J., Yao, A., Li, Y., & Chua, T.S. (2024). Can i trust your answer? visually grounded video question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 13204\u201313214.","DOI":"10.1109\/CVPR52733.2024.01254"},{"key":"2385_CR79","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhao, Z., Xiao, J., Wu, F., Zhang, H., He, X., & Zhuang, Y. (2017). Video question answering via gradually refined attention over appearance and motion. In: Proceedings of the 25th ACM international conference on multimedia, pp. 1645\u20131653.","DOI":"10.1145\/3123266.3123427"},{"key":"2385_CR80","doi-asserted-by":"crossref","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., & Schmid, C. (2021). Just ask: Learning to answer questions from millions of narrated videos. In: Proceedings of the IEEE\/CVF international conference on computer vision (ICCV), pp. 1686\u20131697.","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"2385_CR81","first-page":"124","volume":"35","author":"A Yang","year":"2022","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., & Schmid, C. (2022). Zero-shot video question answering via frozen bidirectional language models. 
Advances in Neural Information Processing Systems (NeurIPS), 35, 124\u2013141.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"2385_CR82","unstructured":"Yu, S., Cho, J., Yadav, P., & Bansal, M. (2023). Self-chained image-language model for video localization and question answering. In: The 37th conference on neural information processing systems (NeurIPS)."},{"key":"2385_CR83","doi-asserted-by":"publisher","first-page":"9127","DOI":"10.1609\/aaai.v33i01.33019127","volume":"33","author":"Z Yu","year":"2019","unstructured":"Yu, Z., Xu, D., Yu, J., Yu, T., Zhao, Z., Zhuang, Y., & Tao, D. (2019). Activitynet-qa: A dataset for understanding complex web videos via question answering. Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 33, 9127\u20139134.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)"},{"key":"2385_CR84","unstructured":"Zeng, A., Attarian, M., Choromanski, K.M., Wong, A., Welker, S., Tombari, F., Purohit, A., Ryoo, M.S., Sindhwani, V., Lee, J., Vanhoucke, V., & Florence, P. (2023). Socratic models: Composing zero-shot multimodal reasoning with language. In: The 11th international conference on learning representations (ICLR)."},{"key":"2385_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, C., Lu, T., Islam, M.M., Wang, Z., Yu, S., Bansal, M., & Bertasius, G. (2023a). A simple llm framework for long-range video question-answering. arXiv preprint arXiv:2312.17235.","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"2385_CR86","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., & Bing, L. (2023b). Video-llama: An instruction-tuned audio-visual language model for video understanding. In: Proceedings of the 2023 conference on empirical methods in natural language processing: system demonstrations, pp. 543\u2013553.","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"2385_CR87","unstructured":"Zhang, P., Zhang, K., Li, B., Zeng, G., Yang, J., Zhang, Y., Wang, Z., Tan, H., Li, C., & Liu, Z. (2024a). Long context transfer from language to vision. arXiv preprint arXiv:2406.16852."},{"key":"2385_CR88","unstructured":"Zhang, R., Han, J., Liu, C., Gao, P., Zhou, A., Hu, X., Yan, S., Lu, P., Li, H., & Qiao, Y. (2023c). Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199."},{"key":"2385_CR89","unstructured":"Zhang, R., Han, J., Zhou, A., Hu, X., Yan, S., Lu, P., Li, H., Gao, P., & Qiao, Y. (2024b). Llama-adapter: Efficient fine-tuning of language models with zero-init attention. ICLR."},{"key":"2385_CR90","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhang, F., & Xu, C. (2023d). Reducing vision-answer biases for multiple-choice vqa. IEEE Transactions on Image Processing (TIP), pp. 4621\u20134634.","DOI":"10.1109\/TIP.2023.3302162"},{"issue":"4","key":"2385_CR91","doi-asserted-by":"publisher","first-page":"1913","DOI":"10.1109\/TPAMI.2023.3269429","volume":"46","author":"X Zhang","year":"2024","unstructured":"Zhang, X., Zhang, F., & Xu, C. (2024). Next-ood: Overcoming dual multiple-choice vqa biases. IEEE Transactions on Pattern Analysis and Machine Intelligence (T-PAMI), 46(4), 1913\u20131931. https:\/\/doi.org\/10.1109\/TPAMI.2023.3269429","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (T-PAMI)"},{"key":"2385_CR92","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Huang, X., Ma, J., Li, Z., Luo, Z., Xie, Y., Qin, Y., Luo, T., Li, Y., Liu, S., et\u00a0al. (2024d). 
Recognize anything: A strong image tagging model. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 1724\u20131732.","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"2385_CR93","unstructured":"Zhang, Y., Li, B., Liu, H., Lee, Y.J., Gui, L., Fu, D., Feng, J., Liu, Z., & Li, C. (2024e). Llava-next: A strong zero-shot video understanding model. https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/."},{"key":"2385_CR94","unstructured":"Zhang, Z., Zhang, A., Li, M., Zhao, H., Karypis, G., & Smola, A. (2023e). Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923."},{"key":"2385_CR95","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., & Girdhar, R. (2023). Learning video representations from large language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 6586\u20136597.","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"2385_CR96","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Xiao, J., Ji, W., Li, Y., Deng, W., & Chua, T.S. (2022). Video question answering: Datasets, algorithms and challenges. In: Proceedings of the 2022 conference on empirical methods in natural language processing (EMNLP), pp. 6439\u20136455.","DOI":"10.18653\/v1\/2022.emnlp-main.432"},{"key":"2385_CR97","unstructured":"Zhu, B., Lin, B., Ning, M., Yan, Y., Cui, J., Wang, H., Pang, Y., Jiang, W., Zhang, J., Li, Z., et\u00a0al. (2023). Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02385-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02385-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02385-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T06:01:21Z","timestamp":1749276081000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02385-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,18]]},"references-count":97,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2385"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02385-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,18]]},"assertion":[{"value":"7 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}
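
The record above is a standard Crossref REST API "work" message: the payload sits under "message", with the title, container-title, author, issued, and reference fields used below. The following is a minimal sketch of how one might fetch and summarize such a record, not part of the deposited metadata. It assumes network access to the public api.crossref.org endpoint, uses only the Python standard library, and the mailto contact in the User-Agent header is a placeholder to replace with a real address per Crossref's polite-pool convention.

import json
from urllib.request import Request, urlopen

DOI = "10.1007/s11263-025-02385-8"  # the DOI asserted in the record above
url = f"https://api.crossref.org/works/{DOI}"

# Identify the client to Crossref; the mailto contact is a placeholder (assumption).
req = Request(url, headers={"User-Agent": "crossref-demo/0.1 (mailto:you@example.org)"})
with urlopen(req) as resp:
    work = json.load(resp)["message"]  # payload sits under "message", as in the record above

title = work["title"][0]              # "VideoQA in the Era of LLMs: An Empirical Study"
journal = work["container-title"][0]  # "International Journal of Computer Vision"
year = work["issued"]["date-parts"][0][0]
authors = ", ".join(f'{a.get("given", "")} {a.get("family", "")}'.strip()
                    for a in work.get("author", []))

print(f"{authors} ({year}). {title}. {journal}. https://doi.org/{work['DOI']}")
print(f"Deposited references: {work.get('reference-count')}")  # 97 in this record

# Reference entries mix structured fields with free-form "unstructured" strings:
for ref in work.get("reference", [])[:3]:
    print("-", ref.get("unstructured") or ref.get("DOI") or ref.get("key"))

The same accessors apply to any DOI, since every "message-type":"work" response shares this shape; only the presence of optional fields (e.g., "reference", "license", "funder") varies by what the publisher deposited.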