{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:57Z","timestamp":1777865337589,"version":"3.51.4"},"reference-count":53,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02042","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"21994-22003","source":"Crossref","is-referenced-by-count":0,"title":["How Can Objects Help Video-Language Understanding?"],"prefix":"10.1109","author":[{"given":"Zitian","family":"Tang","sequence":"first","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shijie","family":"Wang","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junho","family":"Cho","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jaewook","family":"Yoo","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Sun","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","author":"Berrios","year":"2023","journal-title":"Towards language models that can see: 
Computer vision through the lens of natural language"},{"key":"ref2","author":"Chen","year":"2023","journal-title":"Shikra: Unleashing multimodal llm\u2019s referential dialogue magic"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"ref4","author":"Cheng","year":"2024","journal-title":"VideoLLaMA 2: Advancing spatial-temporal modeling and audio understanding in video-LLMs"},{"key":"ref5","article-title":"Motion question answering via modular motion programs","volume-title":"ICML","author":"Endo","year":"2023"},{"key":"ref6","author":"Grattafiori","year":"2024","journal-title":"The Llama 3 herd of models"},{"key":"ref7","author":"Han","year":"2023","journal-title":"ImageBindLLM: Multi-modality instruction tuning"},{"key":"ref8","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"ICLR","author":"Hu","year":"2022"},{"key":"ref9","author":"Jiang","year":"2023","journal-title":"Mistral 7B"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.3758\/BF03212378"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"ref13","article-title":"LLaVA-OneVision: Easy visual task transfer","author":"Li","year":"2025","journal-title":"TMLR"},{"key":"ref14","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"ICML"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01099"},{"key":"ref16","article-title":"CoVLM: Composing visual entities and relationships in large language models via communicative 
decoding","volume-title":"ICLR","author":"Li","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref25","article-title":"MM1: methods, analysis and insights from multimodal LLM pre-training","volume-title":"ECCV","author":"McKinzie","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01257"},{"key":"ref27","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"TMLR"},{"key":"ref28","article-title":"Grounding multimodal large language models to the world","volume-title":"ICLR","author":"Peng","year":"2024"},{"key":"ref29","article-title":"Perception Test: A diagnostic benchmark for multimodal video models","volume-title":"NeurIPS","author":"P\u0103tr\u0103ucean","year":"2023"},{"key":"ref30","article-title":"Learning transferable visual models from natural language supervision","volume-title":"ICML","author":"Radford","year":"2021"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"ref32","article-title":"SAM 2: Segment anything in images and videos","volume-title":"ICLR","author":"Ravi","year":"2025"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"ref34","article-title":"Cambrian-1: A fully open, vision-centric exploration of multimodal 
LLMs","volume-title":"NeurIPS","author":"Tong","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"ref36","article-title":"Vamos: Versatile action models for video understanding","volume-title":"ECCV","author":"Wang","year":"2024"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_11"},{"key":"ref38","author":"Wang","year":"2022","journal-title":"InternVideo: General video foundation models via generative and discriminative learning"},{"key":"ref39","author":"Wang","year":"2024","journal-title":"LifelongMemory: Leveraging LLMs for answering queries in long-form egocentric videos"},{"key":"ref40","article-title":"Language models with image descriptors are strong few-shot video-language learners","volume-title":"NeurIPS","author":"Wang","year":"2022"},{"key":"ref41","article-title":"STAR: A benchmark for situated reasoning in real-world videos","volume-title":"NeurIPS","author":"Wu","year":"2021"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw69036.2025.00644"},{"key":"ref44","article-title":"CLEVRER: collision events for video representation and reasoning","volume-title":"ICLR","author":"Yi","year":"2020"},{"key":"ref45","article-title":"Self-chained image-language model for video localization and question answering","volume-title":"NeurIPS","author":"Yu","year":"2023"},{"key":"ref46","article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","volume-title":"ICLR","author":"Zeng","year":"2023"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref48","article-title":"A simple LLM framework for long-range video 
question-answering","volume-title":"EMNLP","author":"Zhang","year":"2024"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref51","article-title":"LLaMA-Adapter: Efficient fine-tuning of language models with zero-init attention","volume-title":"ICLR","author":"Zhang","year":"2024"},{"key":"ref52","volume-title":"LLaVA-NeXT: A strong zero-shot video understanding model","author":"Zhang","year":"2024"},{"key":"ref53","article-title":"AntGPT: Can large language models help long-term action anticipation from videos?","volume-title":"ICLR","author":"Zhao","year":"2024"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444550.pdf?arnumber=11444550","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:17:25Z","timestamp":1777529845000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444550\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02042","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}