{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:01Z","timestamp":1777865281965,"version":"3.51.4"},"reference-count":101,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00805","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"8600-8612","source":"Crossref","is-referenced-by-count":0,"title":["VLM4D: Towards Spatiotemporal Awareness in Vision Language 
Models"],"prefix":"10.1109","author":[{"given":"Shijie","family":"Zhou","sequence":"first","affiliation":[{"name":"UCLA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander","family":"Vilesov","sequence":"additional","affiliation":[{"name":"UCLA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuehai","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziyu","family":"Wan","sequence":"additional","affiliation":[{"name":"Microsoft"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuwang","family":"Zhang","sequence":"additional","affiliation":[{"name":"UCLA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aditya","family":"Nagachandra","sequence":"additional","affiliation":[{"name":"UCLA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Di","family":"Chang","sequence":"additional","affiliation":[{"name":"USC"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongdong","family":"Chen","sequence":"additional","affiliation":[{"name":"Microsoft"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin Eric","family":"Wang","sequence":"additional","affiliation":[{"name":"UCSC"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Achuta","family":"Kadambi","sequence":"additional","affiliation":[{"name":"UCLA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Phi-4 technical report","author":"Abdin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref3","author":"Abouelenin","year":"2025","journal-title":"Phi-4-mini technical report: Compact yet powerful multimodal language models via 
mixture-of-loras"},{"key":"ref4","article-title":"Cosmos world foundation model platform for physical ai","author":"Agarwal","year":"2025","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Pixtral 12b","author":"Agrawal","year":"2024","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Anthropic. System card: Claude opus 4 & claude sonnet 4","year":"2025","journal-title":"Technical report"},{"key":"ref7","article-title":"Qwen technical report","author":"Bai","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","author":"Brooks","year":"2024","journal-title":"Video generation models as world simulators"},{"key":"ref9","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2006.10.005"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2023.3260405"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0614"},{"key":"ref15","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref16","article-title":"Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms","author":"Cheng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref17","author":"Cui","year":"2024","journal-title":"Sharegpt-4o: Comprehensive multimodal annotations with 
gpt-4o"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1167\/16.3.22"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/vl\/N19-142"},{"key":"ref21","article-title":"An image is worth $16 \\times 16$ words: Transformers for image recognition at scale","volume-title":"In International Conference on Learning Representations","author":"Dosovitskiy"},{"key":"ref22","author":"Driess","year":"2023","journal-title":"Palm-e: An embodied multimodal language model"},{"key":"ref23","article-title":"Missing premise exacerbates overthinking: Are reasoning models losing critical thinking skill?","author":"Fan","year":"2025","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1271"},{"key":"ref25","article-title":"Vlm-3r: Vision-language models augmented with instruction-aligned 3d reconstruction","author":"Fan","year":"2025","journal-title":"arXiv preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1037\/0278-7393.10.1.126"},{"key":"ref27","article-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal LLMs in video analysis","author":"Fu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref28","article-title":"Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities","year":"2025","journal-title":"Technical report"},{"key":"ref29","article-title":"Multimodal-gpt: A vision and language model for dialogue with humans","author":"Gong","year":"2023","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref31","article-title":"Mmworld: Towards multidiscipline multi-faceted world model evaluation in videos","author":"He","year":"2024","journal-title":"arXiv preprint"},{"key":"ref32","article-title":"Mojito: Motion trajectory and intensity control 
for video generation","author":"He","year":"2024","journal-title":"arXiv preprint"},{"key":"ref33","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3758\/BF03212378"},{"key":"ref35","article-title":"How good is my video LMM? complex video reasoning and robustness evaluation suite for video-LMMs","author":"Uzair Khattak","year":"2024","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Openvla: An open-source vision-language-action model","author":"Jin Kim","year":"2024","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1694"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-005-1838-7"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1068\/p130287"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"ref42","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref43","article-title":"Aria: An open multimodal native mixture-of-experts model","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02095"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref47","article-title":"4k4DGen: Panoramic 4d generation at 4k resolution","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Li"},{"key":"ref48","article-title":"Videoeval: Comprehensive benchmark suite for low-cost evaluation of video foundation model","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref49","article-title":"Scenethesis: A language and 
vision agentic framework for 3d scene generation","volume-title":"arXiv preprint","author":"Ling","year":"2025"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.52202\/075280"},{"key":"ref51","article-title":"World model on million-length video and language with ringattention","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref52","article-title":"World model on million-length video and language with blockwise ringattention","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref54","article-title":"Deepseek-vl: towards real-world vision-language understanding","author":"Lu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"Lu","year":"2019","journal-title":"Neural Information Processing Systems"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1098\/rspb.1981.0001"},{"key":"ref58","article-title":"The llama 4 herd: The beginning of a new era of natively multimodal ai innovation","year":"2025","journal-title":"Technical report"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.26599\/cvm.2025.9450516"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_21"},{"key":"ref61","article-title":"Hello gpt-4o","year":"2024","journal-title":"Technical report"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127585"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"ref64","article-title":"The 2017 davis challenge on video object segmentation","author":"Pont-Tuset","year":"2017","journal-title":"arXiv preprint"},{"key":"ref65","author":"Radford","year":"2018","journal-title":"Improving language understanding by generative 
pre-training"},{"key":"ref66","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford"},{"key":"ref67","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","author":"Simonyan","year":"2014","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"issue":"1","key":"ref68","first-page":"89","volume":"10","author":"Spelke","year":"2007","journal-title":"Core knowledge. Developmental Science"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.649"},{"key":"ref70","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref71","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref72","article-title":"Wan: Open and advanced large-scale video generative models","author":"Wan","year":"2025","journal-title":"arXiv preprint"},{"key":"ref73","article-title":"Vlm see, robot do: Human demo video to robot action plan via vision language model","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"ref75","article-title":"Grounded-videollm: Sharpening fine-grained temporal grounding in video large language models","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref76","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref77","article-title":"Compositional 4d dynamic scenes understanding with physics priors for video question 
answering","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"ref79","article-title":"Internvideo2.5: Empowering video mllms with long and rich context modeling","author":"Wang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref80","article-title":"Finetuned language models are zero-shot learners","author":"Wei","year":"2021","journal-title":"arXiv preprint"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1800"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02427"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3488"},{"key":"ref84","article-title":"Grok-2 beta release","year":"2024","journal-title":"Technical report"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"ref86","article-title":"Qwen2.5 technical report","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00994"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01767"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_4"},{"key":"ref91","article-title":"Videollama 3: Frontier multimodal foundation models for image and video understanding","author":"Zhang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref93","article-title":"Combo: compositional world models for embodied multi-agent cooperation","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref94","author":"Zhang","year":"2024","journal-title":"Llava-next: A strong zero-shot video understanding 
model"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00793"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02048"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_19"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01323"},{"key":"ref100","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref101","article-title":"Apollo: An exploration of video understanding in large multimodal models","author":"Zohar","year":"2024","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11443580.pdf?arnumber=11443580","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:10:46Z","timestamp":1777529446000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11443580\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":101,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00805","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}