{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T08:33:52Z","timestamp":1758098032190,"version":"3.44.0"},"reference-count":70,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Young Scientists Fund of the Hunan Natural Science Foundation","award":["2024JJ6474"],"award-info":[{"award-number":["2024JJ6474"]}]},{"name":"Youth Independent Innovation Science Fund Project of NUDT","award":["ZK24-08"],"award-info":[{"award-number":["ZK24-08"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376282"],"award-info":[{"award-number":["62376282"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tpami.2025.3592831","type":"journal-article","created":{"date-parts":[[2025,7,25]],"date-time":"2025-07-25T17:56:43Z","timestamp":1753466203000},"page":"9330-9344","source":"Crossref","is-referenced-by-count":0,"title":["WildVideo: Benchmarking LMMs for Understanding Video-Language Interaction"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0170-3377","authenticated-orcid":false,"given":"Songyuan","family":"Yang","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7449-3093","authenticated-orcid":false,"given":"Weijiang","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6997-0406","authenticated-orcid":false,"given":"Wenjing","family":"Yang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9066-1475","authenticated-orcid":false,"given":"Xinwang","family":"Liu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4060-8793","authenticated-orcid":false,"given":"Huibin","family":"Tan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4238-8985","authenticated-orcid":false,"given":"Long","family":"Lan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2166-977X","authenticated-orcid":false,"given":"Nong","family":"Xiao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","year":"2023","author":"Fu","key":"ref1"},{"key":"ref2","first-page":"9556","article-title":"MMMU: A massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI","volume-title":"Proc. Conf. Comput. Vis. Pattern Recognit.","author":"Xiang","year":"2024"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01363"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01355"},{"key":"ref5","first-page":"8698","article-title":"MMDU: A multi-turn multi-image dialog understanding benchmark and instruction-tuning dataset for LVLMs","author":"Liu","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"article-title":"MM-Vet v2: A challenging benchmark to evaluate large multimodal models for integrated capabilities","year":"2024","author":"Weihao","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref8","first-page":"42748","article-title":"Perception test: A diagnostic benchmark for multimodal video models","volume":"36","author":"Patraucean","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"article-title":"Video-Bench: A comprehensive benchmark and toolkit for evaluating video-based large language models","year":"2023","author":"Munan","key":"ref9"},{"key":"ref10","first-page":"46212","article-title":"EgoSchema: A diagnostic benchmark for very long-form video language understanding","author":"Mangalam","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73113-6_11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.517"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02245"},{"article-title":"LVBench: An extreme long video understanding benchmark","year":"2024","author":"Wang","key":"ref14"},{"article-title":"MLVU: A comprehensive benchmark for multi-task long video understanding","year":"2024","author":"Zhou","key":"ref15"},{"key":"ref16","article-title":"MMWorld: Towards multi-discipline multi-faceted world model evaluation in videos","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"He","year":"2025"},{"key":"ref17","article-title":"VideoHallucer: Evaluating intrinsic and extrinsic hallucinations in large video-language models","volume":"abs\/2406.16338","author":"Wang","year":"2024","journal-title":"CoRR"},{"key":"ref18","first-page":"32076","article-title":"ET Bench: Towards open-ended event-level video-language understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"37","author":"Liu","year":"2024"},{"key":"ref19","first-page":"89098","article-title":"MMBench-video: A long-form multi-shot benchmark for holistic video understanding","volume":"37","author":"Fang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"article-title":"The curse of multi-modalities: Evaluating hallucinations of large multimodal models across language, visual, and audio","year":"2024","author":"Sicong","key":"ref20"},{"key":"ref21","first-page":"53168","article-title":"HourVideo: 1-hour video-language understanding","volume-title":"Adv. Neural Inf. Process. Syst.","volume":"37","author":"Chandrasegaran","year":"2024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"article-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","year":"2023","author":"Bai","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-025-61040-5"},{"year":"2023","key":"ref26","article-title":"GPT-4V(ision) system card"},{"year":"2024","key":"ref27","article-title":"Gemini: A family of highly capable multimodal models"},{"year":"2024","key":"ref28","article-title":"Introducing LLaMA 3.1: Our most capable models to date"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref30","article-title":"VideoLLaMA 2: Advancing spatial-temporal modeling and audio understanding in video-LLMs","volume":"abs\/2406.07476","author":"Cheng","year":"2024","journal-title":"CoRR"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref32","first-page":"19472","article-title":"ShareGPT4Video: Improving video understanding and generation with better captions","volume":"37","author":"Chen","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"year":"2024","key":"ref33","article-title":"InternVL2 blog"},{"year":"2024","key":"ref34","article-title":"GPT-4o mini: Advancing cost-efficient intelligence"},{"key":"ref35","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","volume":"abs\/2403.05530","year":"2024","journal-title":"CoRR"},{"year":"2024","key":"ref36","article-title":"Hello GPT-4o"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3703155"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"article-title":"A survey of hallucination in large foundation models","year":"2023","author":"Vipula","key":"ref39"},{"article-title":"A survey on hallucination in large vision-language models","year":"2024","author":"Hanchao","key":"ref40"},{"article-title":"Hallucination of multimodal large language models: A survey","year":"2024","author":"Zechen","key":"ref41"},{"key":"ref42","first-page":"292","article-title":"Evaluating object hallucination in large vision-language models","volume-title":"Proc. 2023 Conf. Empirical Methods Natural Lang. Process.","author":"Li","year":"2023"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.178"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.573"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.290"},{"article-title":"Evaluation and analysis of hallucination in large vision-language models","year":"2023","author":"Junyang","key":"ref46"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.572"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4251-x"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-024-07421-0"},{"key":"ref50","article-title":"Video instruction tuning with synthetic data","author":"Zhang","year":"2025","journal-title":"Trans. Mach. Learn. Res."},{"year":"2024","key":"ref51","article-title":"Claude 3.5 sonnet"},{"article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90% ChatGPT quality","year":"2023","author":"Chiang","key":"ref52"},{"key":"ref53","article-title":"Yi: Open foundation models by 01.AI","volume":"abs\/2403.04652","year":"2024","journal-title":"CoRR"},{"key":"ref54","article-title":"Qwen2 Technical report","volume":"abs\/2407.10671","year":"2024","journal-title":"CoRR"},{"year":"2024","key":"ref55","article-title":"Introducing meta LLaMA 3: The most capable openly available LLM to date"},{"year":"2024","key":"ref56","article-title":"GPT-4 technical report"},{"key":"ref57","article-title":"Ovis: Structural embedding alignment for multimodal large language model","volume":"abs\/2405.20797","author":"Lu","year":"2024","journal-title":"CoRR"},{"key":"ref58","article-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","volume":"abs\/2409.12191","author":"Qwen","year":"2024","journal-title":"CoRR"},{"key":"ref59","article-title":"SEED-Bench-2-Plus: Benchmarking multimodal large language models with text-rich visual comprehension","volume":"abs\/2404.16790","author":"Li","year":"2024","journal-title":"CoRR"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref61","first-page":"100734","article-title":"ConvBench: A multi-turn conversation evaluation benchmark with hierarchical capability for large vision-language models","volume":"37","author":"Liu","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00772"},{"key":"ref64","first-page":"27056","article-title":"Are we on the right way for evaluating large vision-language models?","volume":"37","author":"Chen","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.365"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"ref67","first-page":"2381","article-title":"MM-Vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2024","journal-title":"Proc. 41st Int. Conf. Mach. Learn."},{"key":"ref68","first-page":"46595","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","volume":"36","author":"Zheng","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02324"},{"year":"2024","key":"ref70","article-title":"Gemini flash: Lightweight models, two variants, both optimized for speed and efficiency"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11163533\/11097075.pdf?arnumber=11097075","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T13:11:30Z","timestamp":1758028290000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11097075\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":70,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3592831","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}