{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:16:35Z","timestamp":1775578595422,"version":"3.50.1"},"reference-count":371,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. 
Video Technol."],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1109\/tcsvt.2025.3566695","type":"journal-article","created":{"date-parts":[[2025,5,2]],"date-time":"2025-05-02T13:18:12Z","timestamp":1746191892000},"page":"1355-1376","source":"Crossref","is-referenced-by-count":60,"title":["Video Understanding With Large Language Models: A Survey"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2796-1787","authenticated-orcid":false,"given":"Yunlong","family":"Tang","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8235-2158","authenticated-orcid":false,"given":"Jing","family":"Bi","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"given":"Siting","family":"Xu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0126-1259","authenticated-orcid":false,"given":"Luchuan","family":"Song","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3523-1339","authenticated-orcid":false,"given":"Susan","family":"Liang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2331-3619","authenticated-orcid":false,"given":"Teng","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Pokfulam, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6959-165X","authenticated-orcid":false,"given":"Daoan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University 
of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1402-8288","authenticated-orcid":false,"given":"Jie","family":"An","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3223-3827","authenticated-orcid":false,"given":"Jingyang","family":"Lin","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"given":"Rongyi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1014-2937","authenticated-orcid":false,"given":"Ali","family":"Vosoughi","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1469-1020","authenticated-orcid":false,"given":"Chao","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3890-5388","authenticated-orcid":false,"given":"Zeliang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"given":"Pinxin","family":"Liu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"given":"Mingqian","family":"Feng","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1701-9141","authenticated-orcid":false,"given":"Feng","family":"Zheng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, Shenzhen, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9317-0268","authenticated-orcid":false,"given":"Jianguo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6685-7950","authenticated-orcid":false,"given":"Ping","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Pokfulam, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4516-9729","authenticated-orcid":false,"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2183-822X","authenticated-orcid":false,"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, 
USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.4249\/scholarpedia.10491"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2007.09.014"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2013.12.005"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10159-8"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-16178-5_38"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2003.1211373"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334092"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLC.2002.1167381"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70738"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2013.11.009"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-88458-3_27"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2015.06.029"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.168"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-015-2819-7"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20044-1_30"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1212.0402"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref25","doi-asserted-by":"
publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.4324\/9781410605337-29"},{"key":"ref28","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017","journal-title":"arXiv:1705.06950"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.373"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_22"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_18"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_43"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2712608"},{"key":"ref40","article-title":"Temporal 3D ConvNets: New architecture and transfer learning for video classification","author":"Diba","year":"2017","journal-title":"arXiv:1711.08200"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref42","first-page":"1","article-title":"V4D: 4D convolutional neural networks for video-level representation learning","volume-title":"Proc. Int. Conf. Learn. 
Represent.","author":"Zhang"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00128"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01920"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26325"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.24818\/ida-ql\/2019.5"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"ref57","first-page":"35946","article-title":"Masked autoencoders as spatiotemporal learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Feichtenhofer"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01003"},{"key":"ref59","first-page":"10078","article-title":"VideoMAE: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Zhan"},{"key":"ref60","article-title":"Self-supervised video representation learning with motion-aware masked autoencoders","author":"Yang","year":"2022","journal-title":"arXiv:2210.04154"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.370"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2757"},{"key":"ref65","article-title":"MaskViT: Masked visual pre-training for video prediction","volume-title":"arXiv:2206.11894","author":"Gupta"},{"key":"ref66","first-page":"1","article-title":"CLIP-ViP: Adapting pre-trained image-text model to video-language alignment","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Xue"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.29"},{"key":"ref68","first-page":"38032","article-title":"Long-form video-language pre-training with multimodal temporal contrastive learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sun"},{"key":"ref69","first-page":"30291","article-title":"Expectation-maximization contrastive learning for compact video-and-language representations","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Jin"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01421"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3709005"},{"key":"ref73","article-title":"DNAGPT: A generalized pretrained tool for multiple DNA sequence analysis tasks","author":"Zhang","year":"2023","journal-title":"arXiv:2307.05628"},{"key":"ref74","volume-title":"Introducing ChatGPT","year":"2022"},{"key":"ref75","article-title":"Visual ChatGPT: Talking, drawing and editing with visual foundation models","author":"Wu","year":"2023","journal-title":"arXiv:2303.04671"},{"key":"ref76","article-title":"Visual instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv:2304.08485"},{"key":"ref77","article-title":"A survey of large language models","author":"Xin Zhao","year":"2023","journal-title":"arXiv:2303.18223"},{"key":"ref78","article-title":"Multimodal foundation models: From specialists to general-purpose assistants","author":"Li","year":"2023","journal-title":"arXiv:2309.10020"},{"key":"ref79","article-title":"A review of deep learning for video captioning","author":"Abdar","year":"2023","journal-title":"arXiv:2304.11431"},{"key":"ref80","article-title":"A comprehensive study of deep video action recognition","author":"Zhu","year":"2020","journal-title":"arXiv:2012.06567"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3696415"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-024-02310-4"},{"key":"ref83","article-title":"Foundation models for video understanding: A survey","author":"Madan","year":"2024","journal-title":"arXiv:2405.03770"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206557"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"ref87","article-title":"A 
short note about kinetics-600","author":"Carreira","year":"2018","journal-title":"arXiv:1808.01340"},{"key":"ref88","article-title":"A short note on the kinetics-700 human action dataset","author":"Carreira","year":"2019","journal-title":"arXiv:1907.06987"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00876"},{"key":"ref90","article-title":"YouTube-8M: A large-scale video classification benchmark","author":"Abu-El-Haija","year":"2016","journal-title":"arXiv:1609.08675"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02062"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_41"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"ref96","article-title":"Youku-mPLUG: A 10 million large-scale Chinese video-language dataset for pre-training and benchmarks","author":"Xu","year":"2023","journal-title":"arXiv:2306.04362"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.aacl-main.48"},{"key":"ref98","article-title":"VideoXum: Cross-modal visual and textural summarization of videos","author":"Lin","year":"2023","journal-title":"arXiv:2303.12060"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32374"},{"key":"ref101","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics, Hum. Lang. 
Technol.","author":"Chen"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.502"},{"key":"ref104","article-title":"Charades-ego: A large-scale dataset of paired third and first person videos","author":"Sigurdsson","year":"2018","journal-title":"arXiv:1804.09626"},{"key":"ref105","article-title":"VAST: A vision-audio-subtitle-text omni-modality foundation model and dataset","author":"Chen","year":"2023","journal-title":"arXiv:2305.18500"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/280"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-1167"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-26284-5_34"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_38"},{"key":"ref117","volume-title":"THUMOS Challenge: Action Recognition With a Large Number of Classes","author":"Jiang et 
al","year":"2014"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02197"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32784"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00797"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3152990"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3241517"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01462"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3369863"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref128","article-title":"VidChapters-7M: Video chapters at scale","author":"Yang","year":"2023","journal-title":"arXiv:2309.13952"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.312"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_27"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25085-9_25"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.133"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.27"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00016"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-48881-3_2"},{"key":"ref136","article-title":"AIM 2024 challenge on video saliency prediction: Methods and 
results","author":"Moskalenko","year":"2024","journal-title":"arXiv:2409.14827"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00514"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/VCIP.2015.7457921"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1167\/14.8.5"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4939-3435-5_16"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2015.08.004"},{"key":"ref143","article-title":"YouTube-VOS: A large-scale video object segmentation benchmark","author":"Xu","year":"2018","journal-title":"arXiv:1809.03327"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"ref145","article-title":"The 4th large-scale video object segmentation challenge\u2013video instance segmentation track","author":"Yang","year":"2022"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01068"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3085907"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref151","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv:2001.08361"},{"key":"ref152","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref153","article-title":"Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic","author":"Chen","year":"2023","journal-title":"arXiv:2306.15195"},{"key":"ref154","article-title":"Pink: Unveiling the power of referential comprehension for multi-modal 
LLMs","author":"Xuan","year":"2023","journal-title":"arXiv:2310.00582"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72673-6_26"},{"key":"ref156","article-title":"Seeing the unseen: Visual metaphor captioning for videos","author":"Kalarani","year":"2024","journal-title":"arXiv:2406.04886"},{"key":"ref157","article-title":"Zero-shot long-form video understanding through screenplay","author":"Wu","year":"2024","journal-title":"arXiv:2406.17309"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01257"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3517625"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.294"},{"key":"ref161","article-title":"Understanding long videos with multimodal language models","author":"Ranasinghe","year":"2024","journal-title":"arXiv:2403.16998"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01723"},{"key":"ref163","article-title":"A simple LLM framework for long-range video question-answering","author":"Zhang","year":"2023","journal-title":"arXiv:2312.17235"},{"key":"ref164","article-title":"Grounding-prompter: Prompting LLM with multimodal information for temporal sentence grounding in long videos","author":"Chen","year":"2023","journal-title":"arXiv:2312.17117"},{"key":"ref165","article-title":"Learning object state changes in videos: An open-world perspective","author":"Xue","year":"2023","journal-title":"arXiv:2312.11782"},{"key":"ref166","article-title":"AntGPT: Can large language models help long-term action anticipation from videos?","author":"Zhao","year":"2023","journal-title":"arXiv:2307.16368"},{"key":"ref167","volume-title":"Vlog: Transform Video as a Document With ChatGPT, Clip, Blip2, Grit, Whisper, Langchain","year":"2023"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"ref169","article-title":"DrVideo: Document retrieval based long 
video understanding","author":"Ma","year":"2024","journal-title":"arXiv:2406.12846"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.559"},{"key":"ref171","article-title":"Too many frames, not all useful: Efficient strategies for long-form video QA","author":"Park","year":"2024","journal-title":"arXiv:2406.09396"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3340103"},{"key":"ref173","article-title":"VideoTree: Adaptive tree-based video representation for LLM reasoning on long videos","author":"Wang","year":"2024","journal-title":"arXiv:2405.19209"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01753"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.544"},{"key":"ref176","article-title":"Reframe anything: LLM agent for open world video reframing","author":"Cao","year":"2024","journal-title":"arXiv:2403.06070"},{"key":"ref177","article-title":"SCHEMA: State CHangEs MAtter for procedure planning in instructional videos","author":"Niu","year":"2024","journal-title":"arXiv:2403.01599"},{"key":"ref178","first-page":"38154","article-title":"HuggingGPT: Solving AI tasks with ChatGPT and its friends in hugging face","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
(NeurIPS)","author":"Shen"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1059"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_5"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72989-8_4"},{"key":"ref182","article-title":"VURF: A general-purpose reasoning and self-refinement framework for video understanding","author":"Mahmood","year":"2024","journal-title":"arXiv:2403.14743"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01780"},{"key":"ref184","article-title":"DoraemonGPT: Toward understanding dynamic scenes with large language models (exemplified as a video agent)","author":"Yang","year":"2024","journal-title":"arXiv:2401.08392"},{"key":"ref185","article-title":"Hawk: Learning to understand open-world video anomalies","author":"Tang","year":"2024","journal-title":"arXiv:2405.16886"},{"key":"ref186","article-title":"LifelongMemory: Leveraging LLMs for answering queries in long-form egocentric videos","author":"Wang","year":"2023","journal-title":"arXiv:2312.05269"},{"key":"ref187","article-title":"Zero-shot video question answering with procedural programs","author":"Choudhury","year":"2023","journal-title":"arXiv:2312.00937"},{"key":"ref188","article-title":"AssistGPT: A general multi-modal assistant that can plan, execute, inspect, and learn","author":"Gao","year":"2023","journal-title":"arXiv:2306.08640"},{"key":"ref189","article-title":"ChatVideo: A tracklet-centric multimodal and versatile video understanding system","author":"Wang","year":"2023","journal-title":"arXiv:2304.14407"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"ref191","article-title":"Artemis: Towards referential understanding in complex videos","author":"Qiu","year":"2024","journal-title":"arXiv:2406.00258"},{"key":"ref192","article-title":"EmoLLM: Multimodal emotional understanding meets large language 
models","author":"Yang","year":"2024","journal-title":"arXiv:2406.16442"},{"key":"ref193","article-title":"Fewer tokens and fewer videos: Extending video understanding abilities in large vision-language models","author":"Chen","year":"2024","journal-title":"arXiv:2406.08024"},{"key":"ref194","article-title":"Flash-VStream: Memory-based real-time understanding for long video streams","author":"Zhang","year":"2024","journal-title":"arXiv:2406.08085"},{"key":"ref195","article-title":"LLAVIDAL: A large LAnguage VIsion model for daily activities of living","author":"Reilly","year":"2024","journal-title":"arXiv:2406.09390"},{"key":"ref196","article-title":"Long context transfer from language to vision","author":"Zhang","year":"2024","journal-title":"arXiv:2406.16852"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0614"},{"key":"ref198","article-title":"Towards event-oriented long video understanding","author":"Du","year":"2024","journal-title":"arXiv:2406.14129"},{"key":"ref199","article-title":"Video-SALMONN: Speech-enhanced audio-visual large language models","author":"Sun","year":"2024","journal-title":"arXiv:2406.15704"},{"key":"ref200","article-title":"VideoGPT$: Integrating image and video encoders for enhanced video understanding","author":"Maaz","year":"2024","journal-title":"arXiv:2406.09418"},{"key":"ref201","article-title":"VideoLLaMA 2: Advancing spatial\u2013temporal modeling and audio understanding in video-LLMs","author":"Cheng","year":"2024","journal-title":"arXiv:2406.07476"},{"key":"ref202","article-title":"MotionLLM: Understanding human behaviors from human motions and videos","author":"Chen","year":"2024","journal-title":"arXiv:2405.20340"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1145\/3689091.3690086"},{"key":"ref205","article-title":"Streaming long video understanding with large language 
models","author":"Qian","year":"2024","journal-title":"arXiv:2405.16009"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.513"},{"key":"ref207","article-title":"TOPA: Extending large language models for video understanding via text-only pre-alignment","author":"Li","year":"2024","journal-title":"arXiv:2405.13911"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01720"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680826"},{"key":"ref210","article-title":"Direct preference optimization of video large multimodal models from language model reward","author":"Zhang","year":"2024","journal-title":"arXiv:2404.01258"},{"key":"ref211","article-title":"From image to video, what do we need in multimodal LLMs?","author":"Huang","year":"2024","journal-title":"arXiv:2404.11865"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01289"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_26"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01282"},{"key":"ref215","article-title":"MiniGPT4-video: Advancing multimodal LLMs for video understanding with interleaved visual-textual tokens","author":"Ataallah","year":"2024","journal-title":"arXiv:2404.03413"},{"key":"ref216","article-title":"Pegasus-v1 technical report","volume-title":"arXiv:2404.14687","author":"Jung","year":"2024"},{"key":"ref217","article-title":"PLLaVA: Parameter-free LLaVA extension from images to videos for video dense captioning","author":"Xu","year":"2024","journal-title":"arXiv:2404.16994"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_1"},{"key":"ref219","article-title":"COSMO: COntrastive streamlined MultimOdal model with interleaved pre-training","author":"Jinpeng Wang","year":"2024","journal-title":"arXiv:2401.00849"},{"key":"ref220","article-title":"Tarsier: Recipes for training and evaluating large 
video description models","author":"Wang","year":"2024","journal-title":"arXiv:2407.00634"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00332"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72684-2_9"},{"key":"ref223","article-title":"VideoLLM: Modeling video sequence with large language models","author":"Chen","year":"2023","journal-title":"arXiv:2305.13292"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"ref225","article-title":"MovieLLM: Enhancing long video understanding with AI-generated movies","author":"Song","year":"2024","journal-title":"arXiv:2403.01422"},{"key":"ref226","article-title":"LLMs meet long video: Advancing long video question answering with an interactive visual adapter in LLMs","author":"Li","year":"2024","journal-title":"arXiv:2402.13546"},{"key":"ref227","article-title":"LSTP: Language-guided spatial\u2013temporal prompt learning for long-form video-text understanding","author":"Wang","year":"2024","journal-title":"arXiv:2402.16050"},{"key":"ref228","article-title":"LVCHAT: Facilitating long video comprehension","author":"Wang","year":"2024","journal-title":"arXiv:2402.12079"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.226"},{"key":"ref230","article-title":"Slot-VLM: SlowFast slots for video-language modeling","author":"Xu","year":"2024","journal-title":"arXiv:2402.13088"},{"key":"ref231","article-title":"Audio-visual LLM for video understanding","author":"Shu","year":"2023","journal-title":"arXiv:2312.06720"},{"key":"ref232","article-title":"Generative multimodal models are in-context learners","author":"Sun","year":"2023","journal-title":"arXiv:2312.13286"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1145\/3688804"},{"key":"ref234","article-title":"VaQuitA: Enhancing alignment in LLM-assisted video 
understanding","author":"Wang","year":"2023","journal-title":"arXiv:2312.02310"},{"key":"ref235","article-title":"VILA: On pre-training for visual language models","author":"Lin","year":"2023","journal-title":"arXiv:2312.07533"},{"key":"ref236","article-title":"Vista-LLaMA: Reducing hallucination in video language models via equal distance to visual tokens","author":"Ma","year":"2023","journal-title":"arXiv:2312.08870"},{"key":"ref237","article-title":"Chat-UniVi: Unified visual representation empowers large language models with image and video understanding","author":"Jin","year":"2023","journal-title":"arXiv:2311.08046"},{"key":"ref238","article-title":"LLaMA-VID: An image is worth 2 tokens in large language models","author":"Li","year":"2023","journal-title":"arXiv:2311.17043"},{"key":"ref239","article-title":"Video-LLaVA: Learning united visual representation by alignment before projection","author":"Lin","year":"2023","journal-title":"arXiv:2311.10122"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"ref241","article-title":"MovieChat: From dense token to sparse memory for long video understanding","author":"Song","year":"2023","journal-title":"arXiv:2307.16449"},{"key":"ref242","article-title":"LLMVA-GEBC: Large language model with video adapter for generic event boundary captioning","author":"Tang","year":"2023","journal-title":"arXiv:2306.10354"},{"key":"ref243","article-title":"Macaw-LLM: Multi-modal language modeling with image, audio, video, and text integration","author":"Lyu","year":"2023","journal-title":"arXiv:2306.09093"},{"key":"ref244","article-title":"Valley: Video assistant with large language model enhanced abilitY","author":"Luo","year":"2023","journal-title":"arXiv:2306.07207"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref246","article-title":"ChatBridge: Bridging modalities with large language model as a language 
catalyst","author":"Zhao","year":"2023","journal-title":"arXiv:2305.16103"},{"key":"ref247","article-title":"Otter: A multi-modal model with in-context instruction tuning","author":"Li","year":"2023","journal-title":"arXiv:2305.03726"},{"key":"ref248","article-title":"Holmes-VAD: Towards unbiased and explainable video anomaly detection via multi-modal LLM","author":"Zhang","year":"2024","journal-title":"arXiv:2406.12235"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01742"},{"key":"ref250","article-title":"HOI-ref: Hand-object interaction referral in egocentric vision","author":"Bansal","year":"2024","journal-title":"arXiv:2404.09933"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_10"},{"key":"ref252","article-title":"HawkEye: Training video-text LLMs for grounding text in videos","author":"Wang","year":"2024","journal-title":"arXiv:2403.10228"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73039-9_12"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01724"},{"key":"ref255","article-title":"Self-chained image-language model for video localization and question answering","author":"Yu","year":"2023","journal-title":"arXiv:2305.06988"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.360"},{"key":"ref257","article-title":"TimeChat: A time-sensitive multimodal large language model for long video understanding","author":"Ren","year":"2023","journal-title":"arXiv:2312.02051"},{"key":"ref258","article-title":"VTimeLLM: Empower LLM to grasp video moments","author":"Huang","year":"2023","journal-title":"arXiv:2311.18445"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32341"},{"key":"ref260","article-title":"Vitron: A unified pixel-level vision LLM for understanding, generating, segmenting, 
editing","author":"Fei","year":"2024","journal-title":"arXiv:2412.19806"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.3390\/app14051894"},{"key":"ref262","article-title":"Momentor: Advancing video large language model with fine-grained temporal reasoning","author":"Qian","year":"2024","journal-title":"arXiv:2402.11435"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01779"},{"key":"ref264","article-title":"OneLLM: One framework to align all modalities with language","author":"Han","year":"2023","journal-title":"arXiv:2312.03700"},{"key":"ref265","article-title":"GPT4Video: A unified multimodal large language model for instruction-followed understanding and safety-aware generation","author":"Wang","year":"2023","journal-title":"arXiv:2311.16511"},{"key":"ref266","article-title":"MM-VID: Advancing video understanding with GPT-4V(ision)","author":"Lin","year":"2023","journal-title":"arXiv:2310.19773"},{"key":"ref267","article-title":"Shot2Story20K: A new benchmark for comprehensive understanding of multi-shot videos","author":"Han","year":"2023","journal-title":"arXiv:2312.10300"},{"key":"ref268","article-title":"Vript: A video is worth thousands of words","author":"Yang","year":"2024","journal-title":"arXiv:2406.06040"},{"key":"ref269","article-title":"Merlin: Empowering multimodal LLMs with foresight minds","author":"Yu","year":"2023","journal-title":"arXiv:2312.00589"},{"key":"ref270","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"ref272","article-title":"Contextual AD narration with interleaved multimodal sequence","author":"Wang","year":"2024","journal-title":"arXiv:2403.12922"},{"key":"ref273","article-title":"MM-narrator: Narrating long-form videos with multimodal in-context learning","author":"Zhang","year":"2023","journal-title":"arXiv:2311.17435"},{"key":"ref274","article-title":"Vamos: Versatile action 
models for video understanding","author":"Wang","year":"2023","journal-title":"arXiv:2311.13627"},{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01255"},{"key":"ref276","article-title":"PG-Video-LLaVA: Pixel grounding large video-language models","author":"Munasinghe","year":"2023","journal-title":"arXiv:2311.13435"},{"key":"ref277","article-title":"Video ChatCaptioner: Towards enriched spatiotemporal descriptions","author":"Chen","year":"2023","journal-title":"arXiv:2304.04227"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref279","article-title":"SlowFast-LLaVA: A strong training-free baseline for video large language models","author":"Xu","year":"2024","journal-title":"arXiv:2407.15841"},{"key":"ref280","article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","author":"Zeng","year":"2022","journal-title":"arXiv:2204.00598"},{"key":"ref281","article-title":"Fine-grained audio-visual joint representations for multimodal large language models","author":"Sun","year":"2023","journal-title":"arXiv:2310.05863"},{"key":"ref282","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Li"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"ref285","first-page":"1","article-title":"STAR: A benchmark for situated reasoning in real-world videos","volume-title":"Proc. NeurIPS","author":"Wu"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref287","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. ACL Workshop Intrinsic Extrinsic Eval. Measures Mach. 
Transl. Summarization","author":"Banerjee"},{"key":"ref288","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text Summarization Branches Out"},{"key":"ref289","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref290","article-title":"EgoSchema: A diagnostic benchmark for very long-form video language understanding","author":"Mangalam","year":"2023","journal-title":"arXiv:2308.09126"},{"key":"ref291","article-title":"Video-MME: The first-ever comprehensive evaluation benchmark of multi-modal LLMs in video analysis","author":"Fu","year":"2024","journal-title":"arXiv:2405.21075"},{"key":"ref292","article-title":"VidComposition: Can MLLMs analyze compositions in compiled videos?","author":"Tang","year":"2024","journal-title":"arXiv:2411.10979"},{"key":"ref293","article-title":"MLVU: Benchmarking multi-task long video understanding","author":"Zhou","year":"2024","journal-title":"arXiv:2406.04264"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.26599\/cvm.2025.9450516"},{"key":"ref295","article-title":"AutoEval-video: An automatic benchmark for assessing large vision language models in open-ended video question answering","author":"Chen","year":"2023","journal-title":"arXiv:2311.14906"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.517"},{"key":"ref297","article-title":"VideoVista: A versatile benchmark for video understanding and reasoning","author":"Li","year":"2024","journal-title":"arXiv:2406.11303"},{"key":"ref298","article-title":"CinePile: A long video question answering dataset and benchmark","author":"Rawal","year":"2024","journal-title":"arXiv:2405.08813"},{"key":"ref299","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01271"},{"key":"ref300","article-title":"Short film dataset (SFD): A benchmark for story-level video 
understanding","author":"Ghermi","year":"2024","journal-title":"arXiv:2406.10221"},{"key":"ref301","article-title":"Beyond raw videos: Understanding edited videos with large multimodal model","author":"Xu","year":"2024","journal-title":"arXiv:2406.10484"},{"key":"ref302","article-title":"InfiniBench: A comprehensive benchmark for large multimodal models in very long video understanding","author":"Ataallah","year":"2024","journal-title":"arXiv:2406.19875"},{"key":"ref303","article-title":"MMWorld: Towards multi-discipline multi-faceted world model evaluation in videos","author":"He","year":"2024","journal-title":"arXiv:2406.08407"},{"key":"ref304","article-title":"VELOCITI: Can video-language models bind semantic concepts through time?","author":"Saravanan","journal-title":"arXiv:2406.10889"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681618"},{"key":"ref306","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32785"},{"key":"ref307","article-title":"MMCOMPOSITION: Revisiting the compositionality of pre-trained vision-language models","author":"Hua","year":"2024","journal-title":"arXiv:2410.09733"},{"key":"ref308","article-title":"Unveiling visual perception in language models: An attention head analysis approach","author":"Bi","year":"2024","journal-title":"arXiv:2412.18108"},{"key":"ref309","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"ref310","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01318"},{"key":"ref311","article-title":"TRACE: Temporal grounding video LLM via causal event modeling","author":"Guo","year":"2024","journal-title":"arXiv:2410.05643"},{"key":"ref312","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"ref313","article-title":"GIT: A generative image-to-text transformer for vision and 
language","author":"Wang","year":"2022","journal-title":"arXiv:2205.14100"},{"key":"ref314","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01727"},{"key":"ref315","volume-title":"GPT-4V: An Overview","year":"2023"},{"key":"ref316","article-title":"LLaMA-adapter v2: Parameter-efficient visual instruction model","author":"Gao","year":"2023","journal-title":"arXiv:2304.15010"},{"key":"ref317","volume-title":"LLaVA-Next: A Strong Zero-Shot Video Understanding Model","author":"Zhang","year":"2024"},{"key":"ref318","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.86"},{"key":"ref319","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref320","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/104"},{"key":"ref321","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"ref322","article-title":"LaunchpadGPT: Language model as music visualization designer on launchpad","author":"Xu","year":"2023","journal-title":"arXiv:2307.04827"},{"key":"ref323","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475196"},{"key":"ref324","first-page":"20839","article-title":"Emotional listener portrait: Neural listener head generation with emotion","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. 
Vis.","author":"Song"},{"key":"ref325","article-title":"Tri2-plane: Volumetric avatar reconstruction with feature pyramid","author":"Song","year":"2024","journal-title":"arXiv:2401.09386"},{"key":"ref326","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506512"},{"key":"ref327","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01756"},{"key":"ref328","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"ref329","doi-asserted-by":"publisher","DOI":"10.1109\/BigData59044.2023.10386291"},{"key":"ref330","article-title":"A survey on deep multi-modal learning for body language recognition and generation","author":"Liu","year":"2023","journal-title":"arXiv:2308.08849"},{"key":"ref331","doi-asserted-by":"publisher","DOI":"10.1007\/s10209-023-00992-1"},{"key":"ref332","article-title":"Generating video game quests from stories","author":"Mishra","year":"2023"},{"key":"ref333","article-title":"Text generation for quests in multiplayer role-playing video games","author":"Koomen","year":"2023"},{"issue":"1","key":"ref334","first-page":"67","article-title":"Large language models for enhancing customer lifecycle management","volume":"7","author":"Soni","year":"2023","journal-title":"J. Empirical Social Sci. Stud."},{"key":"ref335","doi-asserted-by":"publisher","DOI":"10.3390\/vehicles5040076"},{"key":"ref336","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-4641-9_4"},{"key":"ref337","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-30566-5"},{"key":"ref338","article-title":"PromptFix: You prompt and we fix the photo","author":"Yu","year":"2024","journal-title":"arXiv:2405.16785"},{"key":"ref339","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"ref340","first-page":"1","article-title":"MISAR: A multimodal instructional system with augmented reality","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. 
(ICCV)","author":"Bi"},{"key":"ref341","article-title":"Caption anything: Interactive image description with diverse multimodal controls","author":"Wang","year":"2023","journal-title":"arXiv:2305.02677"},{"key":"ref342","article-title":"PromptCap: Prompt-guided task-aware image captioning","author":"Hu","year":"2022","journal-title":"arXiv:2211.09699"},{"key":"ref343","first-page":"23","article-title":"SayPlan: Grounding large language models using 3D scene graphs for scalable robot task planning","volume-title":"Proc. Conf. Robot Learn.","author":"Rana"},{"key":"ref344","doi-asserted-by":"publisher","DOI":"10.2196\/46885"},{"key":"ref345","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"ref346","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"key":"ref347","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"ref348","article-title":"LLaVA-med: Training a large language-and-vision assistant for biomedicine in one day","author":"Li","year":"2023","journal-title":"arXiv:2306.00890"},{"key":"ref349","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-023-04124-5"},{"key":"ref350","doi-asserted-by":"publisher","DOI":"10.1016\/j.cose.2023.103139"},{"key":"ref351","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2023.110689"},{"key":"ref352","doi-asserted-by":"publisher","DOI":"10.1109\/TNSM.2024.3358730"},{"key":"ref353","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2023.09.101"},{"key":"ref354","article-title":"GraphGPT: Graph instruction tuning for large language models","author":"Tang","year":"2023","journal-title":"arXiv:2310.13023"},{"key":"ref355","article-title":"Efficient masked AutoEncoder for video object counting and a large-scale benchmark","author":"Cao","year":"2024","journal-title":"arXiv:2411.13056"},{"key":"ref356","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00101"},{"key":"ref357","article-title":"LISA: Reasoning segmentation via large language 
model","author":"Lai","year":"2023","journal-title":"arXiv:2308.00692"},{"key":"ref358","article-title":"A survey on generative AI and LLM for video generation, understanding, and streaming","author":"Zhou","year":"2024","journal-title":"arXiv:2404.16038"},{"key":"ref359","article-title":"VideoDirectorGPT: Consistent multi-scene video generation via LLM-guided planning","author":"Lin","year":"2023","journal-title":"arXiv:2309.15091"},{"key":"ref360","article-title":"VideoPoet: A large language model for zero-shot video generation","author":"Kondratyuk","year":"2023","journal-title":"arXiv:2312.14125"},{"key":"ref361","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-025-00099-6"},{"key":"ref362","article-title":"B-VLLM: A vision large language model with balanced spatio-temporal tokens","author":"Lu","year":"2024","journal-title":"arXiv:2412.09919"},{"key":"ref363","article-title":"Edge-based video analytics: A survey","author":"Hu","year":"2023","journal-title":"arXiv:2303.14329"},{"key":"ref364","article-title":"Federated large language models: Current progress and future directions","author":"Yao","year":"2024","journal-title":"arXiv:2409.15723"},{"key":"ref365","doi-asserted-by":"publisher","DOI":"10.1109\/DCOSS-IoT61029.2024.00025"},{"key":"ref366","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096019"},{"key":"ref367","first-page":"1","article-title":"Multimedia-agent: A multimodal agent for multimedia content generation","volume-title":"Proc. 
ICLR","author":"Zhang"},{"key":"ref368","article-title":"EventHallusion: Diagnosing event hallucinations in video LLMs","author":"Zhang","year":"2024","journal-title":"arXiv:2409.16597"},{"key":"ref369","article-title":"Video token merging for long-form video understanding","author":"Lee","year":"2024","journal-title":"arXiv:2410.23782"},{"key":"ref370","article-title":"Enhancing multimodal LLM for detailed and accurate video captioning using multi-round preference optimization","author":"Tang","year":"2024","journal-title":"arXiv:2410.06682"},{"key":"ref371","article-title":"Interpolating video-LLMs: Toward longer-sequence LMMs in a training-free manner","author":"Shang","year":"2024","journal-title":"arXiv:2409.12963"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/76\/11392768\/10982110.pdf?arnumber=10982110","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T21:02:46Z","timestamp":1770930166000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10982110\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":371,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2025.3566695","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]}}}