{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:18:20Z","timestamp":1777889900493,"version":"3.51.4"},"reference-count":66,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02068","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"22274-22284","source":"Crossref","is-referenced-by-count":0,"title":["Object-Centric Video Question Answering with Visual Grounding and Referring"],"prefix":"10.1109","author":[{"given":"Haochen","family":"Wang","sequence":"first","affiliation":[{"name":"University of Amsterdam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qirui","family":"Chen","sequence":"additional","affiliation":[{"name":"SAI, Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cilin","family":"Yan","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiayin","family":"Cai","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaolong","family":"Jiang","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yao","family":"Hu","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weidi","family":"Xie","sequence":"additional","affiliation":[{"name":"SAI, Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stratis","family":"Gavves","sequence":"additional","affiliation":[{"name":"University of Amsterdam"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00172"},{"key":"ref3","article-title":"Qwen2.5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv preprint"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0219"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00065"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01227"},{"key":"ref7","article-title":"Shikra: Unleashing multimodal 1lm\u2019s referential dialogue magic","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32214"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"ref11","article-title":"Videollama 2: Advancing spatialtemporal modeling and audio understanding in video-llms","author":"Cheng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01229"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01850"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3611020"},{"key":"ref16","article-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv preprint"},{"key":"ref17","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00568"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20870-7_8"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref22","first-page":"17283","article-title":"Grounding language models to images for multimodal inputs and outputs","volume-title":"International Conference on Machine Learning","author":"Koh","year":"2023"},{"key":"ref23","article-title":"Generating images with multimodal language models","volume":"36","author":"Koh","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"ref25","article-title":"Mimicit: Multi-modal in-context instruction tuning","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref27","article-title":"Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref29","article-title":"Towards robust referring video object segmentation with cyclic relational consensus","author":"Li","year":"2022","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02021"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref32","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text Summarization Branches Out"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00809"},{"key":"ref34","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00380"},{"key":"ref36","article-title":"Decoupled weight decay eegularization","author":"Loshchilov","year":"2017","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02036"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.79"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref42","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01629-1"},{"key":"ref44","article-title":"Sam 2: Segment anything in images and videos","author":"Ravi","year":"2024","journal-title":"arXiv preprint"},{"key":"ref45","first-page":"29441","article-title":"Hiera: A hierarchical vision transformer without the bells-andwhistles","volume-title":"International Conference on Machine Learning","author":"Ryali","year":"2023"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_13"},{"key":"ref47","article-title":"Generative pretraining in multimodality","author":"Sun","year":"2023","journal-title":"arXiv preprint"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02076-w"},{"key":"ref50","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01060"},{"key":"ref52","article-title":"Internvideo: General video foundation models via generative and discriminative learning","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72633-0_6"},{"key":"ref55","article-title":"Videogpt: Video generation using vq-vae and transformers","author":"Yan","year":"2021","journal-title":"arXiv preprint"},{"key":"ref56","article-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref57","article-title":"Mplug-owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref58","article-title":"Ferret: Refer and ground anything anywhere at any granularity","author":"You","year":"2023","journal-title":"arXiv preprint"},{"key":"ref59","article-title":"Sa2va: Marrying sam2 with llava for dense grounded understanding of images and videos","author":"Yuan","year":"2025","journal-title":"arXiv"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02664"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01767"},{"key":"ref62","article-title":"Videollama 3: Frontier multimodal foundation models for image and video understanding","author":"Zhang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-91813-1_4"},{"key":"ref65","article-title":"Video instruction tuning with synthetic data","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref66","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445213.pdf?arnumber=11445213","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:23:52Z","timestamp":1777613032000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445213\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02068","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}