{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T17:07:19Z","timestamp":1780765639071,"version":"3.54.1"},"reference-count":86,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01876","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"20168-20179","source":"Crossref","is-referenced-by-count":1,"title":["VCA: Video Curious Agent for Long Video Understanding"],"prefix":"10.1109","author":[{"given":"Zeyuan","family":"Yang","sequence":"first","affiliation":[{"name":"University of Massachusetts,Amherst"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Delin","family":"Chen","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xueyang","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Massachusetts,Amherst"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Maohao","family":"Shen","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chuang","family":"Gan","sequence":"additional","affiliation":[{"name":"University of Massachusetts,Amherst"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Pixtral 12b","author":"Agrawal","year":"2024","journal-title":"arXiv preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref4","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Memory consolidation enables long-context video understanding","volume-title":"Forty-first International Conference on Machine Learning","author":"Balazevic","year":"2024"},{"key":"ref6","author":"Bavishi","year":"2023","journal-title":"Fuyu-8b: A multimodal architecture for ai agents"},{"key":"ref7","author":"Bertasius","year":"2021","journal-title":"Is space-time attention all you need for video understanding?"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.502"},{"key":"ref9","article-title":"Pali-3 vision language models: Smaller, faster, stronger","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"ref11","article-title":"Videollama 2: Advancing spatialtemporal modeling and audio understanding in video-llms","author":"Cheng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"Long story short: a summarize-then-search method for long video question answering","author":"Chung","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.18.1.193"},{"key":"ref14","author":"Dosovitskiy","year":"2021","journal-title":"An image is worth 16x16 words: Transformers for image recognition at scale"},{"key":"ref15","author":"Durante","year":"2024","journal-title":"Agent ai: Surveying the horizons of multimodal interaction"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_5"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.213"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref20","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.70675\/802872a7z8092z4556za1fdz2c7af45f0e72"},{"key":"ref23","article-title":"Cogvlm 2: Visual language models for image and video understanding","author":"Hong","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"ref25","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint"},{"key":"ref26","author":"Jang","year":"2024","journal-title":"Videowebarena: Evaluating long context multimodal agents with video understanding web tasks"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01300"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.50"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_16"},{"key":"ref30","author":"Kumar","year":"2024","journal-title":"Mmctagent: Multi-modal critical thinking agent framework for complex visual reasoning"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref32","author":"Li","journal-title":"Llms meet long video: Advancing long video comprehension with an interactive visual adapter in 11 ms, 2024"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref35","author":"Lin","year":"2023","journal-title":"Vila: On pre-training for visual language models"},{"key":"ref36","author":"Lin","journal-title":"Vila: Efficient video-language alignment for video question answering. 1"},{"key":"ref37","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref38","article-title":"World model on million-length video and language with ringattention","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01764"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01764"},{"key":"ref41","article-title":"Egoschema: A diagnostic benchmark for very long-form video language understanding","author":"Mangalam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7299101"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01364"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2026.eacl-long.164"},{"key":"ref45","article-title":"Cinepile: A long video question answering dataset and benchmark","author":"Rawal","journal-title":"Synthetic Data for Computer Vision Workshop@ CVPR 2024"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01357"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01725"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2756"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2025.3566695"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"ref54","article-title":"Chameleon: Mixed-modal early-fusion foundation models","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0732"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2015.510"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref59","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref60","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref61","article-title":"Lvbench: An extreme long video understanding benchmark, 2024","author":"Wang","journal-title":"1, 5"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72989-8_4"},{"key":"ref63","article-title":"Internvideo 2: Scaling video foundation models for multimodal video understanding","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref64","article-title":"Videollamb: Long-context video understanding with recurrent memory bridges","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref65","article-title":"Lifelongmemory: Leveraging 11 ms for answering queries in long-form egocentric videos","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00311"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1800"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_26"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01322"},{"key":"ref71","article-title":"Longvideobench: A benchmark for long-context interleaved video-language understanding","author":"Wu","year":"2024","journal-title":"CoRR"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref73","article-title":"Openagents: An open platform for language agents in the wild","author":"Xie","year":"2024","journal-title":"ICLR"},{"key":"ref74","article-title":"Osworld: Benchmarking multimodal agents for open-ended tasks in real computer environments","author":"Xie","year":"2024","journal-title":"arXiv preprint"},{"key":"ref75","author":"Xu","journal-title":"Retrieval-based video language model for efficient long video question answering, 2023"},{"key":"ref76","author":"Yao","year":"2023","journal-title":"Webshop: Towards scalable real-world web interaction with grounded language agents"},{"key":"ref77","article-title":"Self-chained image-language model for video localization and question answering","author":"Yu","year":"2023","journal-title":"NeurIPS"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3354"},{"key":"ref79","article-title":"A simple 11 m framework for long-range video question-answering","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref81","article-title":"Movqa: A benchmark of versatile question-answering for long-form movie understanding","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref82","article-title":"Long context transfer from language to vision","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref83","author":"Zhang","year":"2024","journal-title":"Llavanext: A strong zero-shot video understanding model"},{"key":"ref84","article-title":"Longagent: Scaling language models to 128 k context through multi-agent collaboration","author":"Zhao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref85","article-title":"Videoprism: A foundational visual encoder for video understanding","volume-title":"Forty-first International Conference on Machine Learning","author":"Zhao"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445362.pdf?arnumber=11445362","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:30:16Z","timestamp":1777530616000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445362\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":86,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01876","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}