{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T09:05:42Z","timestamp":1779267942091,"version":"3.51.4"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T00:00:00Z","timestamp":1779235200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T00:00:00Z","timestamp":1779235200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s11263-026-02831-1","type":"journal-article","created":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T08:08:29Z","timestamp":1779264509000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Dr.V : A Hierarchical Perception-Temporal-Cognition Framework to Diagnose Video Hallucination by Fine-Grained Spatial-Temporal Grounding"],"prefix":"10.1007","volume":"134","author":[{"given":"Meng","family":"Luo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6192-1194","authenticated-orcid":false,"given":"Shengqiong","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liqiang","family":"Jing","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianjie","family":"Ju","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinxiang","family":"Lai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianlong","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinya","family":"Du","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siyuan","family":"Yan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"William Yang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Fei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mong-Li","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wynne","family":"Hsu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,5,20]]},"reference":[{"key":"2831_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2831_CR2","doi-asserted-by":"crossref","unstructured":"Bae, K., Kim, J., Lee, S., Lee, S., Lee, G., & Choi, J. (2025). Mash-vlm: Mitigating action-scene hallucination in video-llms through disentangled spatial-temporal representations. In Proceedings of the computer vision and pattern recognition conference, pp 13744\u201313753.","DOI":"10.1109\/CVPR52734.2025.01283"},{"key":"2831_CR3","unstructured":"Bai, J., Bai, S., Chu, Y., Cui, Z., Dang, K., Deng, X., Fan, Y., Ge, W., Han, Y., Huang, F., et\u00a0al. (2023). Qwen technical report. arXiv preprint arXiv:2309.16609"},{"key":"2831_CR4","unstructured":"Bai, S., Cai, Y., Chen, R., Chen, K., Chen, X., Cheng, Z., Deng, L., Ding, W., Gao, C., Ge, C., et\u00a0al. (2025a). Qwen3-vl technical report. arXiv preprint arXiv:2511.21631"},{"key":"2831_CR5","unstructured":"Bai, S., Chen, K., Liu, X., Wang, J., Ge, W., Song, S., Dang, K., Wang, P., Wang, S., Tang, J., et\u00a0al. (2025b). Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923"},{"key":"2831_CR6","unstructured":"Bai, Z., Wang, P., Xiao, T., He, T., Han, Z., Zhang, Z., & Shou, M. Z. (2024). Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930"},{"key":"2831_CR7","unstructured":"Chen, D., & Dolan, W. B. (2011). Collecting highly parallel data for paraphrase evaluation. In Proceedings of the 49th annual meeting of the association for computational linguistics: human language technologies, pp 190\u2013200."},{"key":"2831_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wu, J., Wang, W., Su, W., Chen, G., Xing, S., Zhong, M., Zhang, Q., Zhu, X., Lu, L., et\u00a0al. (2024). Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 24185\u201324198.","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"2831_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, T., Song, L., Ge, Y., Liu, W., Wang, X., & Shan, Y. (2024a). Yolo-world: Real-time open-vocabulary object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"2831_CR10","unstructured":"Cheng, Z., Leng, S., Zhang, H., Xin, Y., Li, X., Chen, G., Zhu, Y., Zhang, W., Luo, Z., Zhao, D., et\u00a0al. (2024b). Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms. arXiv preprint arXiv:2406.07476"},{"key":"2831_CR11","unstructured":"Choong, W. Y., Guo, Y., & Kankanhalli, M. (2024). Vidhal: Benchmarking temporal hallucinations in vision llms. arXiv preprint arXiv:2411.16771"},{"key":"2831_CR12","unstructured":"Chu, Z., Zhang, L., Sun, Y., Xue, S., Wang, Z., Qin, Z., & Ren, K. (2024). Sora detector: A unified hallucination detection for large text-to-video models. arXiv preprint arXiv:2405.04180"},{"key":"2831_CR13","unstructured":"Comanici, G., Bieber, E., Schaekermann, M., Pasupat, I., Sachdeva, N., Dhillon, I., Blistein, M., Ram, O., Zhang, D., Rosen, E., et\u00a0al. (2025). Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities. arXiv preprint arXiv:2507.06261"},{"key":"2831_CR14","unstructured":"Ding, X., Zhang, K., Han, J., Hong, L., Xu, H., & Li, X. (2025). Pami-vdpo: Mitigating video hallucinations by prompt-aware multi-instance video preference learning. arXiv preprint arXiv:2504.05810"},{"key":"2831_CR15","doi-asserted-by":"crossref","unstructured":"Fu, C., Dai, Y., Luo, Y., Li, L., Ren, S., Zhang, R., Wang, Z., Zhou, C., Shen, Y., Zhang, M., et\u00a0al. (2024). Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"2831_CR16","unstructured":"Gao, H., Qu, J., Tang, J., Bi, B., Liu, Y., Chen, H., Liang, L., Su, L., & Huang, Q. (2025). Exploring hallucination of large multimodal models in video understanding: Benchmark, analysis and mitigation. arXiv preprint arXiv:2503.19622"},{"key":"2831_CR17","doi-asserted-by":"crossref","unstructured":"Gu, X., Fan, H., Huang, Y., Luo, T., & Zhang, L. (2024). Context-guided spatio-temporal video grounding. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 18330\u201318339.","DOI":"10.1109\/CVPR52733.2024.01735"},{"key":"2831_CR18","unstructured":"Guo, D., Yang, D., Zhang, H., Song, J., Zhang, R., Xu, R., Zhu, Q., Ma, S., Wang, P., Bi, X., et\u00a0al. (2025). Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948"},{"key":"2831_CR19","unstructured":"He, X., Feng, W., Zheng, K., Lu, Y., Zhu, W., Li, J., Fan, Y., Wang, J., Li, L., Yang, Z., et\u00a0al. (2024). Mmworld: Towards multi-discipline multi-faceted world model evaluation in videos. arXiv preprint arXiv:2406.08407"},{"key":"2831_CR20","doi-asserted-by":"crossref","unstructured":"Huang, D. A., Ramanathan, V., Mahajan, D., Torresani, L., Paluri, M., Fei-Fei, L., & Niebles, J. C. (2018). What makes a video a video: Analyzing temporal information in video understanding models and datasets. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2018.00769"},{"key":"2831_CR21","unstructured":"Huang, H., Chen, H., Wu, S., Luo, M., Fu, J., Du, X., Zhang, H., & Fei, H. (2025). Vistadpo: Video hierarchical spatial-temporal direct preference optimization for large video models. arXiv preprint arXiv:2504.13122"},{"key":"2831_CR22","unstructured":"Hurst, A., Lerer, A., Goucher, A. P., Perelman, A., Ramesh, A., Clark, A., Ostrow, A., Welihinda, A., Hayes, A., Radford, A., et\u00a0al. (2024). Gpt-4o system card. arXiv preprint arXiv:2410.21276"},{"key":"2831_CR23","doi-asserted-by":"crossref","unstructured":"Kong, M., Zeng, X., Chen, L., Li, Y., Yan, B., & Zhu, Q. (2025). Mhbench: Demystifying motion hallucination in videollms. AAAI-25, sponsored by the association for the advancement of artificial intelligence, February 25 - March 4, 2025 (pp. 4401\u20134409). Philadelphia: PA, USA.","DOI":"10.1609\/aaai.v39i4.32463"},{"key":"2831_CR24","doi-asserted-by":"crossref","unstructured":"Lavee, G., Rivlin, E., & Rudzsky, M. (2009). Understanding video events: A survey of methods for automatic interpretation of semantic occurrences in video. IEEE Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews).","DOI":"10.1109\/TSMCC.2009.2023380"},{"key":"2831_CR25","doi-asserted-by":"crossref","unstructured":"Li, C., Im, E. W., & Fazli, P. (2024a). Vidhalluc: Evaluating temporal hallucinations in multimodal large language models for video understanding. arXiv preprint arXiv:2412.03735","DOI":"10.1109\/CVPR52734.2025.01281"},{"key":"2831_CR26","doi-asserted-by":"crossref","unstructured":"Li, J., Niu, L., & Zhang, L. (2022). From representation to reasoning: Towards both evidence and commonsense reasoning for video question-answering. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 21273\u201321282.","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"2831_CR27","unstructured":"Li, J., Lu, W., Fei, H., Luo, M., Dai, M., Xia, M., Jin, Y., Gan, Z., Qi, D., Fu, C., et\u00a0al. (2024b). A survey on benchmarks of multimodal large language models. arXiv preprint arXiv:2408.08632"},{"key":"2831_CR28","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Liu, Y., Wang, Z., Xu, J., Chen, G., Luo, P., et\u00a0al. (2024c). Mvbench: A comprehensive multi-modal video understanding benchmark. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 22195\u201322206.","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"2831_CR29","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., & Jia, J. (2025a). Llama-vid: An image is worth 2 tokens in large language models. In European conference on computer vision, pp 323\u2013340.","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"2831_CR30","doi-asserted-by":"crossref","unstructured":"Li, Z., Wu, X., Shi, G., Qin, Y., Du, H., Zhou, T., Manocha, D., & Boyd-Graber, J. L. (2025b). Videohallu: Evaluating and mitigating multi-modal hallucinations on synthetic video understanding. arXiv preprint arXiv:2505.01481","DOI":"10.32388\/BXC6X1"},{"key":"2831_CR31","doi-asserted-by":"crossref","unstructured":"Lin, B., Ye, Y., Zhu, B., Cui, J., Ning, M., Jin, P., & Yuan, L. (2023). Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2831_CR32","doi-asserted-by":"crossref","unstructured":"Liu, H., & Wan, X. (2023). Models see hallucinations: Evaluating the factuality in video captioning. arXiv preprint arXiv:2303.02961","DOI":"10.18653\/v1\/2023.emnlp-main.723"},{"key":"2831_CR33","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Li, C., Yang, J., Su, H., Zhu, J., et\u00a0al. (2023). Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"2831_CR34","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Liu, Y., Wang, Y., Ren, S., Li, L., Chen, S., Sun, X., & Hou, L. (2024). Tempcompass: Do video llms really understand videos? arXiv preprint arXiv:2403.00476","DOI":"10.18653\/v1\/2024.findings-acl.517"},{"key":"2831_CR35","doi-asserted-by":"crossref","unstructured":"Ma, F., Jin, X., Wang, H., Xian, Y., Feng, J., & Yang, Y. (2024). Vista-llama: Reducing hallucination in video language models via equal distance to visual tokens. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13151\u201313160","DOI":"10.1109\/CVPR52733.2024.01249"},{"key":"2831_CR36","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F. S. (2023). Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2831_CR37","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F. (2024). Videogpt+: Integrating image and video encoders for enhanced video understanding. arXiv preprint arXiv:2406.09418"},{"key":"2831_CR38","unstructured":"OpenAI. (2022). Introducing chatgpt."},{"key":"2831_CR39","unstructured":"OpenAI. (2025). GPT-5 System Card. Technical report, OpenAI, accessed: 2025-08-10."},{"key":"2831_CR40","unstructured":"Ravi, N., Gabeur, V., Hu, Y. T., Hu, R., Ryali, C., Ma, T., Khedr, H., R\u00e4dle, R., Rolland, C., Gustafson, L., et\u00a0al. (2024). Sam 2: Segment anything in images and videos. arXiv preprint arXiv:2408.00714"},{"key":"2831_CR41","doi-asserted-by":"crossref","unstructured":"Rawte, V., Jain, S., Sinha, A., Kaushik, G., Bansal, A., Vishwanath, P. R., Jain, S. R., Reganti , A. N., Jain, V., Chadha, A., et\u00a0al. (2024). Vibe: A text-to-video benchmark for evaluating hallucination in large multimodal models. arXiv preprint arXiv:2411.10867","DOI":"10.18653\/v1\/2025.trustnlp-main.15"},{"key":"2831_CR42","unstructured":"Ren, T., Liu, S., Zeng, A., Lin, J., Li, K., Cao, H., Chen, J., Huang, X., Chen, Y., Yan, F., Zeng , Z., Zhang, H., Li, F., Yang, J., Li, H., Jiang, Q., & Zhang, L. (2024). Grounded sam: Assembling open-world models for diverse visual tasks. arXiv:2401.14159"},{"key":"2831_CR43","first-page":"11709","volume":"2024","author":"P Sahoo","year":"2024","unstructured":"Sahoo, P., Meharia, P., Ghosh, A., Saha, S., Jain, V., & Chadha, A. (2024). A comprehensive survey of hallucination in large language, image, video and audio foundation models. Findings of the Association for Computational Linguistics: EMNLP, 2024, 11709\u201311724.","journal-title":"Findings of the Association for Computational Linguistics: EMNLP"},{"key":"2831_CR44","unstructured":"Shangguan, Z., Li, C., Ding, Y., Zheng, Y., Zhao, Y., Fitzgerald, T., & Cohan, A. (2024). Tomato: Assessing visual temporal reasoning capabilities in multimodal foundation models. arXiv preprint arXiv:2410.23266"},{"key":"2831_CR45","unstructured":"Sun, Y., Liu, Z., Liu, C., Pu, B., Zhang, Z., & Xie, H. (2024). Hallucination mitigation prompts long-term video understanding. arXiv preprint arXiv:2406.11333"},{"key":"2831_CR46","unstructured":"Team, G., Georgiev, P., Lei, V. I., Burnell, R., Bai, L., Gulati, A., Tanzer, G., Vincent, D., Pan, Z., Wang, S., et\u00a0al. (2024). Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530"},{"key":"2831_CR47","doi-asserted-by":"crossref","unstructured":"Tom, G., Mathew, M., Garcia-Bordils, S., Karatzas, D., & Jawahar, C. (2023). Reading between the lanes: Text videoqa on the road. In International conference on document analysis and recognition (pp. 137\u2013154). Springer.","DOI":"10.1007\/978-3-031-41731-3_9"},{"key":"2831_CR48","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., Rodriguez, A., Joulin, A., Grave, E., & Lample, G. (2023a). Llama: Open and efficient foundation language models. CoRR abs\/2302.13971"},{"key":"2831_CR49","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S., et\u00a0al. (2023b). Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288"},{"key":"2831_CR50","doi-asserted-by":"crossref","unstructured":"Wang, H., Xu, Z., Cheng, Y., Diao, S., Zhou, Y., Cao, Y., Wang, Q., Ge, W., & Huang, L. (2024a). Grounded-videollm: Sharpening fine-grained temporal grounding in video large language models. arXiv preprint arXiv:2410.03290","DOI":"10.18653\/v1\/2025.findings-emnlp.50"},{"key":"2831_CR51","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y. F., & Wang, W. Y. (2019). Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In Proceedings of the IEEE\/CVF international conference on computer vision, pp 4581\u20134591.","DOI":"10.1109\/ICCV.2019.00468"},{"key":"2831_CR52","unstructured":"Wang, Y., Wang, Y., Zhao, D., Xie, C., & Zheng, Z. (2024b). Videohallucer: Evaluating intrinsic and extrinsic hallucinations in large video-language models. arXiv preprint arXiv:2406.16338"},{"key":"2831_CR53","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., & Chua, T. S. (2021). Next-qa: Next phase of question-answering to explaining temporal actions. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9777\u20139786.","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"2831_CR54","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., & Rui, Y. (2016). Msr-vtt: A large video description dataset for bridging video and language. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"2831_CR55","unstructured":"Xu, L., Zhao, Y., Zhou, D., Lin, Z., Ng, S. K., & Feng, J. (2024). Pllava: Parameter-free llava extension from images to videos for video dense captioning. arXiv preprint arXiv:2404.16994"},{"key":"2831_CR56","unstructured":"Yi, K., Gan, C., Li, Y., Kohli, P., Wu, J., Torralba, A., & Tenenbaum, J. B. (2019). Clevrer: Collision events for video representation and reasoning. arXiv preprint arXiv:1910.01442"},{"key":"2831_CR57","doi-asserted-by":"crossref","unstructured":"Yu, Z., Xu, D., Yu, J., Yu, T., Zhao, Z., Zhuang, Y., & Tao, D. (2019). Activitynet-qa: A dataset for understanding complex web videos via question answering. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"2831_CR58","unstructured":"Zhang, J., Jiao, Y., Chen, S., Chen, J., & Jiang, Y. G. (2024a). Eventhallusion: Diagnosing event hallucinations in video llms. arXiv preprint arXiv:2409.16597"},{"key":"2831_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, R., Gui, L., Sun, Z., Feng, Y., Xu, K., Zhang, Y., Fu, D., Li, C., Hauptmann, A., Bisk, Y., et\u00a0al. (2024b). Direct preference optimization of video large multimodal models from language model reward. arXiv preprint arXiv:2404.01258","DOI":"10.18653\/v1\/2025.naacl-long.30"},{"key":"2831_CR60","unstructured":"Zhang, Y., Li, B., Liu h, Lee Yj, Gui, L., Fu, D., Feng, J., Liu, Z., & Li, C. (2024c). Llava-next: A strong zero-shot video understanding model. https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"},{"key":"2831_CR61","first-page":"35549","volume":"35","author":"M Zhao","year":"2022","unstructured":"Zhao, M., Li, B., Wang, J., Li, W., Zhou, W., Zhang, L., Xuyang, S., Yu, Z., Yu, X., Li, G., et al. (2022). Towards video text visual question answering: Benchmark and baseline. Advances in Neural Information Processing Systems, 35, 35549\u201335562.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2831_CR62","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., & Corso, J. (2018). Towards automatic learning of procedures from web instructional videos. In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"2831_CR63","doi-asserted-by":"crossref","unstructured":"Zohar, O., Wang, X., Dubois, Y., Mehta, N., Xiao, T., Hansen-Estruch, P., Yu, L., Wang, X., Juefei-Xu, F., Zhang, N., et\u00a0al. (2024). Apollo: An exploration of video understanding in large multimodal models. arXiv preprint arXiv:2412.10360","DOI":"10.1109\/CVPR52734.2025.01760"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02831-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02831-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02831-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T08:08:52Z","timestamp":1779264532000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02831-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,20]]},"references-count":63,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["2831"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02831-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,20]]},"assertion":[{"value":"7 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 May 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"278"}}