{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T17:06:39Z","timestamp":1771952799064,"version":"3.50.1"},"reference-count":101,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2022ZD0162000"],"award-info":[{"award-number":["2022ZD0162000"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LD24F020016"],"award-info":[{"award-number":["LD24F020016"]}]},{"DOI":"10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","award":["62225605"],"award-info":[{"award-number":["62225605"]}],"id":[{"id":"10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Scientific Research Foundation of Sichuan Provincial Department of Science and Technology, China","award":["2024YFHZ0001"],"award-info":[{"award-number":["2024YFHZ0001"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. 
Intell."],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1109\/tpami.2025.3604614","type":"journal-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:31:02Z","timestamp":1756834262000},"page":"374-389","source":"Crossref","is-referenced-by-count":3,"title":["MovieChat+: Question-Aware Sparse Memory for Long Video Question Answering"],"prefix":"10.1109","volume":"48","author":[{"given":"Enxin","family":"Song","sequence":"first","affiliation":[{"name":"Zhejiang University-University of Illinois Urbana-Champaign Institute, Zhejiang University, Haining, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2611-0008","authenticated-orcid":false,"given":"Wenhao","family":"Chai","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, The University of Washington, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8255-2997","authenticated-orcid":false,"given":"Tian","family":"Ye","sequence":"additional","affiliation":[{"name":"Robotics and Autonomous Systems Thrust, Hong Kong University of Science and Technology (GuangZhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8877-2421","authenticated-orcid":false,"given":"Jenq-Neng","family":"Hwang","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, The University of Washington, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3023-1662","authenticated-orcid":false,"given":"Xi","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8403-1538","authenticated-orcid":false,"given":"Gaoang","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University-University of Illinois Urbana-Champaign Institute, Zhejiang University, Haining, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Alayrac"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1080\/02724980543000097"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.744"},{"key":"ref4","article-title":"Meet claude","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/S0079-7421(08)60422-3"},{"key":"ref7","article-title":"TRECVID 2020: A comprehensive campaign for evaluating video retrieval tasks across multiple application domains","author":"Awad","year":"2021"},{"key":"ref8","article-title":"TRECVID 2019: An evaluation campaign to benchmark video activity detection, video captioning and matching, and video search & retrieval","author":"Awad","year":"2020"},{"key":"ref9","first-page":"1","article-title":"TRECVID 2018: Benchmarking video activity detection, video captioning and matching, video storytelling linking and video search","volume-title":"Proc. TRECVID","author":"Awad"},{"key":"ref10","first-page":"1","article-title":"TRECVID 2017: Evaluating ad-hoc and instance video search, events detection, video captioning, and hyperlinking","volume-title":"Proc. 
TREC Video Retrieval Eval.","author":"Awad"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref13","first-page":"1","article-title":"Token Merging: Your ViT but faster","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Bolya"},{"key":"ref14","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00792"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.3390\/app12136588"},{"key":"ref18","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Chen"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"ref20","article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"Chiang","year":"2023"},{"key":"ref21","article-title":"Stablevicuna","year":"2023"},{"key":"ref22","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"ref23","article-title":"PaLM-E: An embodied multimodal language model","author":"Driess","year":"2023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01855"},{"key":"ref25","article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"ref27","article-title":"LLaMA-Adapter V2: Parameter-efficient visual instruction model","author":"Gao","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref29","article-title":"Multimodal-GPT: A vision and language model for dialogue with humans","author":"Gong","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.3389\/fmars.2022.1071618"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00413"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_41"},{"key":"ref33","article-title":"langchain","year":"2023"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref36","article-title":"Seed-bench: Benchmarking multimodal LLMs with generative comprehension","author":"Li","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3571946"},{"key":"ref38","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref39","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref40","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Li"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref42","article-title":"MAVOT: Memory-augmented video object tracking","author":"Liu","year":"2017"},{"key":"ref43","article-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref45","article-title":"Macaw-LLM: Multi-modal language modeling with image, audio, video, and text integration","author":"Lyu","year":"2023"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1076-4"},{"key":"ref47","article-title":"Vista-LLaMA: Reliable video narrator via equal distance to visual tokens","author":"Ma","year":"2023"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref49","article-title":"EgoSchema: A diagnostic benchmark for very long-form video language understanding","author":"Mangalam","year":"2023"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref51","article-title":"GPT3.5","year":"2021"},{"key":"ref52","article-title":"GPT-4 technical report","year":"2023"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.130"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00207"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.447"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_11"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0851-8"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_10"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_38"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01265"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00797"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1101\/cshperspect.a021766"},{"key":"ref66","article-title":"BERT position encoding","author":"Su","year":"2023"},{"key":"ref67","article-title":"PandaGPT: One model to instruction-follow them all","author":"Su","year":"2023"},{"key":"ref68","article-title":"Stanford alpaca: An instruction-following llama model","author":"Taori","year":"2023"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"ref70","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref71","article-title":"LLaMA 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref72","article-title":"ChatVideo: A tracklet-centric multimodal and versatile video understanding system","author":"Wang","year":"2023"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01271"},{"key":"ref74","article-title":"GIT: A generative image-to-text transformer for vision and language","author":"Wang","year":"2022"},{"key":"ref75","article-title":"VisionLLM: Large language model is also an open-ended decoder for vision-centric 
tasks","author":"Wang","year":"2023"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_26"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00037"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01322"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01254"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVIDLICCEA56201.2022.9825193"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"ref84","article-title":"mPLUG-2: A modularized multi-modal foundation model across text, image and video","author":"Xu","year":"2023"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref86","first-page":"124","article-title":"Zero-shot video question answering via frozen bidirectional language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_10"},{"key":"ref88","article-title":"mPLUG-Owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02208"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_38"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref93","article-title":"LLaMA-adapter: Efficient fine-tuning of language models with zero-init attention","author":"Zhang","year":"2023"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_28"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_11"},{"key":"ref96","article-title":"Hierarchical auto-organizing system for open-ended multi-agent navigation","author":"Zhao","year":"2024"},{"key":"ref97","article-title":"Do we really need a complex agent system? 
Distill embodied agent into a single model","author":"Zhao","year":"2024"},{"key":"ref98","article-title":"On large language models\u2019 selection bias in multi-choice questions","author":"Zheng","year":"2023"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01727"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3272319"},{"key":"ref101","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/34\/11275622\/11146594-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11275622\/11146594.pdf?arnumber=11146594","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T21:01:27Z","timestamp":1764882087000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11146594\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":101,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3604614","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1]]}}}
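The record above is a standard Crossref work object, so its bibliographic fields can be read with any JSON parser. Below is a minimal sketch of extracting a citation from it; the filename moviechat_crossref.json is a hypothetical local copy of the record, and every field accessed (title, author, container-title, volume, issue, page, DOI, issued, reference, references-count) appears verbatim in the JSON above.

```python
import json

# Load a locally saved Crossref work record (hypothetical filename).
with open("moviechat_crossref.json", encoding="utf-8") as f:
    work = json.load(f)["message"]

# Pull the bibliographic fields present in the record.
title = work["title"][0]
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])
venue = work["container-title"][0]
year = work["issued"]["date-parts"][0][0]

# Print a simple citation line plus a sanity check on the reference list.
print(f'{authors}. "{title}." {venue} {work["volume"]}.{work["issue"]} '
      f'({year}): {work["page"]}. doi:{work["DOI"]}.')
print(f'Deposited references: {len(work.get("reference", []))} '
      f'(declared: {work["references-count"]}).')
```

For this record the second line should report 101 deposited references, matching the declared references-count.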