{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:25:31Z","timestamp":1765308331380,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755537","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"4639-4648","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["GraphVideoAgent: Enhancing Long-form Video Understanding with Entity Relation Graphs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7105-5439","authenticated-orcid":false,"given":"Meng","family":"Chu","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5659-793X","authenticated-orcid":false,"given":"Yicong","family":"Li","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","unstructured":"Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M. Dai Anja Hauth Katie Millican David Silver Slav Petrov Melvin Johnson Ioannis Antonoglou Julian Schrittwieser Amelia Glaese Jilin Chen Emily Pitler Timothy P. Lillicrap Angeliki Lazaridou Orhan Firat James Molloy Michael Isard Paul Ronald Barham Tom Hennigan Benjamin Lee Fabio Viola Malcolm Reynolds Yuanzhong Xu Ryan Doherty Eli Collins Clemens Meyer Eliza Rutherford Erica Moreira Kareem Ayoub Megha Goel George Tucker Enrique Piqueras Maxim Krikun Iain Barr Nikolay Savinov Ivo Danihelka Becca Roelofs Ana\u00efs White Anders Andreassen Tamara von Glehn Lakshman Yagati Mehran Kazemi Lucas Gonzalez Misha Khalman Jakub Sygnowski and et al. 2023. Gemini: A Family of Highly Capable Multimodal Models. CoRR Vol. abs\/2312.11805 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2312.11805 arXiv:2312.11805","DOI":"10.48550\/ARXIV.2312.11805"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.4324\/9780429449642"},{"key":"e_1_3_2_1_3_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Bai Ziyi","year":"2024","unstructured":"Ziyi Bai, Ruiping Wang, and Xilin Chen. 2024. Glance and Focus: Memory Prompting for Multi-Event Video Question Answering. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"2542","author":"Balazevic Ivana","year":"2024","unstructured":"Ivana Balazevic, Yuge Shi, Pinelopi Papalampidi, Rahma Chaabouni, Skanda Koppula, and Olivier J Henaff. 2024. Memory Consolidation Enables Long-Context Video Understanding. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235). PMLR, 2527-2542."},{"key":"e_1_3_2_1_5_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877-1901."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"e_1_3_2_1_7_1","volume-title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos. In The International Conference on Learning Representations (ICLR).","author":"Chen Yukang","year":"2025","unstructured":"Yukang Chen, Fuzhao Xue, Dacheng Li, Qinghao Hu, Ligeng Zhu, Xiuyu Li, Yunhao Fang, Haotian Tang, Shang Yang, Zhijian Liu, Ethan He, Hongxu Yin, Pavlo Molchanov, Jan Kautz, Linxi Fan, Yuke Zhu, Yao Lu, and Song Han. 2025. LongVILA: Scaling Long-Context Visual Language Models for Long Videos. In The International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_8_1","volume-title":"TraveLLaMA: Facilitating Multi-modal Large Language Models to Understand Urban Scenes and Provide Travel Assistance. arXiv preprint arXiv:2504.16505","author":"Chu Meng","year":"2025","unstructured":"Meng Chu, Yukang Chen, Haokun Gui, Shaozuo Yu, Yi Wang, and Jiaya Jia. 2025a. TraveLLaMA: Facilitating Multi-modal Large Language Models to Understand Urban Scenes and Provide Travel Assistance. arXiv preprint arXiv:2504.16505 (2025)."},{"key":"e_1_3_2_1_9_1","volume-title":"3D-TAFS: A Training-free Framework for 3D Affordance Segmentation. arXiv preprint arXiv:2409.10078","author":"Chu Meng","year":"2024","unstructured":"Meng Chu, Xuan Zhang, Zhedong Zheng, and Tat-Seng Chua. 2024. 3D-TAFS: A Training-free Framework for 3D Affordance Segmentation. arXiv preprint arXiv:2409.10078 (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73247-8_13"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00262"},{"key":"e_1_3_2_1_12_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"Joya Chen, Zihan Fan, and Mike Zheng Shou.","author":"Gao Difei","year":"2023","unstructured":"Difei Gao, Lei Ji, Luowei Zhou, Kevin Qinghong Lin, Joya Chen, Zihan Fan, and Mike Zheng Shou. 2023a. AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn. arXiv preprint arXiv:2306.08640 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02012"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01535"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"e_1_3_2_1_18_1","volume-title":"Smeulders","author":"Hussein Noureldien","year":"2019","unstructured":"Noureldien Hussein, Efstratios Gavves, and Arnold W. M. Smeulders. 2019. VideoGraph: Recognizing Minutes-Long Human Activities in Videos. CoRR, Vol. abs\/1905.05143 (2019). arXiv:1905.05143 http:\/\/arxiv.org\/abs\/1905.05143"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_6"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra Singh Chaplot Diego de Las Casas Emma Bou Hanna Florian Bressand Gianna Lengyel Guillaume Bour Guillaume Lample L\u00e9lio Renard Lavaud Lucile Saulnier Marie-Anne Lachaux Pierre Stock Sandeep Subramanian Sophia Yang Szymon Antoniak Teven Le Scao Th\u00e9ophile Gervet Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2024. Mixtral of Experts. CoRR Vol. abs\/2401.04088 (2024). https:\/\/doi.org\/10.48550\/ARXIV.2401.04088 arXiv:2401.04088","DOI":"10.48550\/ARXIV.2401.04088"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413902"},{"key":"e_1_3_2_1_22_1","volume-title":"Semi-parametric video-grounded text generation. arXiv preprint arXiv:2301.11507","author":"Kim Sungdong","year":"2023","unstructured":"Sungdong Kim, Jin-Hwa Kim, Jiyoung Lee, and Minjoon Seo. 2023. Semi-parametric video-grounded text generation. arXiv preprint arXiv:2301.11507 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.50"},{"key":"e_1_3_2_1_24_1","volume-title":"Text-conditioned resampler for long form video understanding. arXiv preprint arXiv:2312.11897","author":"Korbar Bruno","year":"2023","unstructured":"Bruno Korbar, Yongqin Xian, Alessio Tonioni, Andrew Zisserman, and Federico Tombari. 2023. Text-conditioned resampler for long form video understanding. arXiv preprint arXiv:2312.11897 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_26_1","unstructured":"KunChang Li Yinan He Yi Wang Yizhuo Li Wenhai Wang Ping Luo Yali Wang Limin Wang and Yu Qiao. 2023a. VideoChat: Chat-Centric Video Understanding. arXiv:2305.06355 [cs.CV]"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. IEEE.","author":"Li Yicong","year":"2025","unstructured":"Yicong Li, Yiyang Chen, Zhenyuan Ma, Junbin Xiao, Xiang Wang, and Yao Angela. 2025. Intermediate Connectors and Geometric Priors for Language-Guided Affordance Segmentation on Unseen Object Categories. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. IEEE."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548035"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"e_1_3_2_1_30_1","volume-title":"Transformer-empowered invariant grounding for video question answering","author":"Li Yicong","year":"2023","unstructured":"Yicong Li, Xiang Wang, Junbin Xiao, Wei Ji, and Tat-Seng Chua. 2023b. Transformer-empowered invariant grounding for video question answering. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01275"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475540"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01351"},{"key":"e_1_3_2_1_34_1","volume-title":"Yifan Xu, Xixuan Song, Shudan Zhang, Hanyu Lai, Xinyi Liu, Hanlin Zhao, et al.","author":"Liu Xiao","year":"2024","unstructured":"Xiao Liu, Tianjie Zhang, Yu Gu, Iat Long Iong, Yifan Xu, Xixuan Song, Shudan Zhang, Hanyu Lai, Xinyi Liu, Hanlin Zhao, et al., 2024. Visualagentbench: Towards large multimodal models as visual foundation agents. arXiv preprint arXiv:2408.06327 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"e_1_3_2_1_36_1","volume-title":"Vista-LLaMA: Reliable Video Narrator via Equal Distance to Visual Tokens. arXiv preprint arXiv:2312.08870","author":"Ma Fan","year":"2023","unstructured":"Fan Ma, Xiaojie Jin, Heng Wang, Yuchen Xian, Jiashi Feng, and Yi Yang. 2023. Vista-LLaMA: Reliable Video Narrator via Equal Distance to Visual Tokens. arXiv preprint arXiv:2312.08870 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems","author":"Mangalam Karttikeya","year":"2023","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2023. EgoSchema: A Diagnostic Benchmark for Very Long-form Video Language Understanding. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023, Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (Eds.)."},{"key":"e_1_3_2_1_38_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Mangalam Karttikeya","year":"2024","unstructured":"Karttikeya Mangalam, Ruslan Akshulakov, and Jitendra Malik. 2024. Egoschema: A diagnostic benchmark for very long-form video language understanding. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01428"},{"key":"e_1_3_2_1_40_1","volume-title":"S4nd: Modeling images and videos as multidimensional signals with state spaces. Advances in neural information processing systems","author":"Nguyen Eric","year":"2022","unstructured":"Eric Nguyen, Karan Goel, Albert Gu, Gordon Downs, Preey Shah, Tri Dao, Stephen Baccus, and Christopher R\u00e9. 2022. S4nd: Modeling images and videos as multidimensional signals with state spaces. Advances in neural information processing systems, Vol. 35 (2022), 2846-2861."},{"key":"e_1_3_2_1_41_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL]"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00288"},{"key":"e_1_3_2_1_43_1","volume-title":"A Simple Recipe for Contrastively Pre-training Video-First Encoders Beyond 16 Frames. arXiv preprint arXiv:2312.07395","author":"Papalampidi Pinelopi","year":"2023","unstructured":"Pinelopi Papalampidi, Skanda Koppula, Shreya Pathak, Justin Chiu, Joe Heyward, Viorica Patraucean, Jiajun Shen, Antoine Miech, Andrew Zisserman, and Aida Nematzdeh. 2023. A Simple Recipe for Contrastively Pre-training Video-First Encoders Beyond 16 Frames. arXiv preprint arXiv:2312.07395 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01364"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"e_1_3_2_1_46_1","volume-title":"Generalized Video Moment Retrieval. In The Thirteenth International Conference on Learning Representations.","author":"Qin You","year":"2025","unstructured":"You Qin, Qilong Wu, Yicong Li, Wei Ji, Li Li, Pengcheng Cai, Lina Wei, and Roger Zimmermann. 2025. Generalized Video Moment Retrieval. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475263"},{"key":"e_1_3_2_1_48_1","volume-title":"Video-xl: Extra-long vision language model for hour-scale video understanding. arXiv preprint arXiv:2409.14485","author":"Shu Yan","year":"2024","unstructured":"Yan Shu, Peitian Zhang, Zheng Liu, Minghao Qin, Junjie Zhou, Tiejun Huang, and Bo Zhao. 2024. Video-xl: Extra-long vision language model for hour-scale video understanding. arXiv preprint arXiv:2409.14485 (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_1_50_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"Long-form video-language pre-training with multimodal temporal contrastive learning. Advances in neural information processing systems","author":"Sun Yuchong","year":"2022","unstructured":"Yuchong Sun, Hongwei Xue, Ruihua Song, Bei Liu, Huan Yang, and Jianlong Fu. 2022. Long-form video-language pre-training with multimodal temporal contrastive learning. Advances in neural information processing systems, Vol. 35 (2022), 38032-38045."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_2_1_53_1","volume-title":"European Conference on Computer Vision. Springer, 142-160","author":"Wang Shijie","year":"2024","unstructured":"Shijie Wang, Qi Zhao, Minh Quan Do, Nakul Agarwal, Kwonjoon Lee, and Chen Sun. 2024. Vamos: Versatile action models for video understanding. In European Conference on Computer Vision. Springer, 142-160."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72989-8_4"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00020"},{"key":"e_1_3_2_1_56_1","volume-title":"InternVideo: General Video Foundation Models via Generative and Discriminative Learning. CoRR","author":"Wang Yi","year":"2022","unstructured":"Yi Wang, Kunchang Li, Yizhuo Li, Yinan He, Bingkun Huang, Zhiyu Zhao, Hongjie Zhang, Jilan Xu, Yi Liu, Zun Wang, Sen Xing, Guo Chen, Junting Pan, Jiashuo Yu, Yali Wang, Limin Wang, and Yu Qiao. 2022. InternVideo: General Video Foundation Models via Generative and Discriminative Learning. CoRR, Vol. abs\/2212.03191 (2022)."},{"key":"e_1_3_2_1_57_1","volume-title":"Lifelongmemory: Leveraging llms for answering queries in egocentric videos. arXiv preprint arXiv:2312.05269","author":"Wang Yixin","year":"2023","unstructured":"Yixin Wang, You Yang, and Mingzhuo Ren. 2023. Lifelongmemory: Leveraging llms for answering queries in egocentric videos. arXiv preprint arXiv:2312.05269 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"Linknet: Relational embedding for scene graph. Advances in neural information processing systems","author":"Woo Sanghyun","year":"2018","unstructured":"Sanghyun Woo, Dahun Kim, Donghyeon Cho, and In So Kweon. 2018. Linknet: Relational embedding for scene graph. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/Proceedings"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3292266"},{"key":"e_1_3_2_1_63_1","unstructured":"Jiaqi Xu Cuiling Lan Wenxuan Xie Xuejin Chen and Yan Lu. 2023. Retrieval-based Video Language Model for Efficient Long Video Question Answering. arXiv:2312.04931 [cs.CV]"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"e_1_3_2_1_65_1","unstructured":"Antoine Yang Antoine Miech Josef Sivic Ivan Laptev and Cordelia Schmid. 2022b. Zero-Shot Video Question Answering via Frozen Bidirectional Language Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"e_1_3_2_1_67_1","volume-title":"What gives the answer away? question answering bias analysis on video qa datasets. arXiv preprint arXiv:2007.03626","author":"Yang Jianing","year":"2020","unstructured":"Jianing Yang, Yuying Zhu, Yongxin Wang, Ruitao Yi, Amir Zadeh, and Louis-Philippe Morency. 2020. What gives the answer away? question answering bias analysis on video qa datasets. arXiv preprint arXiv:2007.03626 (2020)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision.","author":"Yu Jiashuo","year":"2025","unstructured":"Jiashuo Yu, Yue Wu, Meng Chu, Zhifei Ren, Zizheng Huang, Pei Chu, Ruijie Zhang, Yinan He, Qirui Li, Songze Li, et al., 2025. VRBench: A Benchmark for Multi-Step Reasoning in Long Narrative Videos. In Proceedings of the IEEE\/CVF International Conference on Computer Vision."},{"key":"e_1_3_2_1_70_1","volume-title":"Self-Chained Image-Language Model for Video Localization and Question Answering. NeurIPS","author":"Yu Shoubin","year":"2023","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2023. Self-Chained Image-Language Model for Video Localization and Question Answering. NeurIPS (2023)."},{"key":"e_1_3_2_1_71_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Yu Sangho","year":"2024","unstructured":"Sangho Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2024. Self-chained image-language model for video localization and question answering. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00611"},{"key":"e_1_3_2_1_73_1","volume-title":"Ziyang Wang, Shoubin Yu, Mohit Bansal, and Gedas Bertasius.","author":"Zhang Ce","year":"2023","unstructured":"Ce Zhang, Taixi Lu, Md Mohaiminul Islam, Ziyang Wang, Shoubin Yu, Mohit Bansal, and Gedas Bertasius. 2023a. A simple llm framework for long-range video question-answering. arXiv preprint arXiv:2312.17235 (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"Zichen Wang, Sangho Yu, Mohit Bansal, and Gedas Bertasius.","author":"Zhang Chunting","year":"2023","unstructured":"Chunting Zhang, Thomas Lu, Md Mohaiminul Islam, Zichen Wang, Sangho Yu, Mohit Bansal, and Gedas Bertasius. 2023b. A simple llm framework for long-range video question-answering. arXiv preprint arXiv:2312.17235 (2023)."},{"key":"e_1_3_2_1_75_1","volume-title":"Flash-VStream: Memory-Based Real-Time Understanding for Long Video Streams. arXiv preprint arXiv:2406.08085","author":"Zhang Haoji","year":"2024","unstructured":"Haoji Zhang, Yiqin Wang, Yansong Tang, Yong Liu, Jiashi Feng, Jifeng Dai, and Xiaojie Jin. 2024. Flash-VStream: Memory-Based Real-Time Understanding for Long Video Streams. arXiv preprint arXiv:2406.08085 (2024)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00637"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755537","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:39Z","timestamp":1765308039000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755537"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":76,"alternative-id":["10.1145\/3746027.3755537","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755537","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}