{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T17:08:20Z","timestamp":1781284100165,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92470205, 62176002"],"award-info":[{"award-number":["92470205, 62176002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754839","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"10807-10816","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["TimeChat-Online: 80% Visual Tokens are Naturally Redundant in Streaming Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9809-8864","authenticated-orcid":false,"given":"Linli","family":"Yao","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5599-1504","authenticated-orcid":false,"given":"Yicheng","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1576-3341","authenticated-orcid":false,"given":"Yuancheng","family":"Wei","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6984-5104","authenticated-orcid":false,"given":"Lei","family":"Li","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Hong Kong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9998-864X","authenticated-orcid":false,"given":"Shuhuai","family":"Ren","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7218-4011","authenticated-orcid":false,"given":"Yuanxin","family":"Liu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5788-9126","authenticated-orcid":false,"given":"Kun","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7676-8065","authenticated-orcid":false,"given":"Lean","family":"Wang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5724-0641","authenticated-orcid":false,"given":"Shicheng","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7994-8050","authenticated-orcid":false,"given":"Sida","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9033-2724","authenticated-orcid":false,"given":"Lingpeng","family":"Kong","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Hong Kong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4608-5778","authenticated-orcid":false,"given":"Qi","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Hong Kong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1460-8124","authenticated-orcid":false,"given":"Yuanxing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8241-9320","authenticated-orcid":false,"given":"Xu","family":"Sun","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44503-X_27"},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2024. Claude 3.5 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet"},{"key":"e_1_3_2_1_3_1","volume-title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens. ArXiv preprint","author":"Ataallah Kirolos","year":"2024","unstructured":"Kirolos Ataallah, Xiaoqian Shen, Eslam Abdelrahman, Essam Sleiman, Deyao Zhu, Jian Ding, and Mohamed Elhoseiny. 2024. MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens. ArXiv preprint, Vol. abs\/2404.03413 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities. ArXiv preprint","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities. ArXiv preprint, Vol. abs\/2308.12966 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_6_1","volume-title":"Auroracap: Efficient, performant video detailed captioning and a new benchmark. arXiv preprint arXiv:2410.03051","author":"Chai Wenhao","year":"2024","unstructured":"Wenhao Chai, Enxin Song, Yilun Du, Chenlin Meng, Vashisht Madhavan, Omer Bar-Tal, Jenq-Neng Hwang, Saining Xie, and Christopher D Manning. 2024. Auroracap: Efficient, performant video detailed captioning and a new benchmark. arXiv preprint arXiv:2410.03051 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01742"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Liang Chen Haozhe Zhao Tianyu Liu Shuai Bai Junyang Lin Chang Zhou and Baobao Chang. 2024b. An Image is Worth 1\/2 Tokens After Layer 2: Plug-and-Play Inference Acceleration for Large Vision-Language Models. arXiv:2403.06764 [cs.CV]","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0882"},{"key":"e_1_3_2_1_10_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven C. H. Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven C. H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023, Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (Eds.)."},{"key":"e_1_3_2_1_11_1","volume-title":"Streaming video question-answering with in-context video kv-cache retrieval. arXiv preprint arXiv:2503.00540","author":"Di Shangzhe","year":"2025","unstructured":"Shangzhe Di, Zhelun Yu, Guanghao Zhang, Haoyuan Li, Tao Zhong, Hao Cheng, Bolin Li, Wanggui He, Fangxun Shu, and Hao Jiang. 2025. Streaming video question-answering with in-context video kv-cache retrieval. arXiv preprint arXiv:2503.00540 (2025)."},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, G Heigold, S Gelly, et al., 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aur\u00e9lien Rodriguez Austen Gregerson Ava Spataru Baptiste Rozi\u00e8re Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric Michael Smith Filip Radenovic Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia Lewis Anderson Graeme Nail Gr\u00e9goire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol Arrieta Ibarra Isabel M. Kloumann Ishan Misra Ivan Evtimov Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan Vasuden Alwala Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone and et al. 2024. The Llama 3 Herd of Models. ArXiv preprint Vol. abs\/2407.21783 (2024)."},{"key":"e_1_3_2_1_14_1","unstructured":"Chaoyou Fu Yuhan Dai Yondong Luo Lei Li Shuhuai Ren Renrui Zhang Zihan Wang Chenyu Zhou Yunhang Shen Mengdan Zhang et al. 2024. Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis. ArXiv preprint Vol. abs\/2405.21075 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Unlocking multimodal understanding across millions of tokens of context. ArXiv preprint","author":"Team Gemini","year":"2024","unstructured":"Gemini Team. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. ArXiv preprint, Vol. abs\/2403.05530 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395","author":"Hu Shengding","year":"2024","unstructured":"Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Xiang Long, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, et al., 2024. Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Online video understanding: A comprehensive benchmark and memory-augmented method. arXiv preprint arXiv:2501.00584","author":"Huang Zhenpeng","year":"2024","unstructured":"Zhenpeng Huang, Xinhao Li, Jiaqi Li, Jing Wang, Xiangyu Zeng, Cheng Liang, Tao Wu, Xi Chen, Liang Li, and Limin Wang. 2024. Online video understanding: A comprehensive benchmark and memory-augmented method. arXiv preprint arXiv:2501.00584 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_19_1","volume-title":"2024 e. LLaVA-OneVision: Easy Visual Task Transfer. ArXiv preprint","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024 e. LLaVA-OneVision: Easy Visual Task Transfer. ArXiv preprint, Vol. abs\/2408.03326 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"LLaVA-ST: A Multimodal Large Language Model for Fine-Grained Spatial-Temporal Understanding. arXiv preprint arXiv:2501.08282","author":"Li Hongyu","year":"2025","unstructured":"Hongyu Li, Jinyu Chen, Ziyu Wei, Shaofei Huang, Tianrui Hui, Jialin Gao, Xiaoming Wei, and Si Liu. 2025a. LLaVA-ST: A Multimodal Large Language Model for Fine-Grained Spatial-Temporal Understanding. arXiv preprint arXiv:2501.08282 (2025)."},{"key":"e_1_3_2_1_21_1","volume-title":"VideoChat: Chat-Centric Video Understanding. ArXiv preprint","author":"Li Kunchang","year":"2023","unstructured":"Kunchang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. VideoChat: Chat-Centric Video Understanding. ArXiv preprint, Vol. abs\/2305.06355 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"ICLR","author":"Li Lei","year":"2025","unstructured":"Lei Li, Yuanxin Liu, Linli Yao, Peiyuan Zhang, Chenxin An, Lean Wang, Xu Sun, Lingpeng Kong, and Qi Liu. 2025b. Temporal Reasoning Transfer from Text to Video. In ICLR 2025. OpenReview.net. https:\/\/openreview.net\/forum?id=sHAvMp5J4R"},{"key":"e_1_3_2_1_23_1","volume-title":"Videochat-flash: Hierarchical compression for long-context video modeling. arXiv preprint arXiv:2501.00574","author":"Li Xinhao","year":"2024","unstructured":"Xinhao Li, Yi Wang, Jiashuo Yu, Xiangyu Zeng, Yuhan Zhu, Haian Huang, Jianfei Gao, Kunchang Li, Yinan He, Chenting Wang, et al., 2024c. Videochat-flash: Hierarchical compression for long-context video modeling. arXiv preprint arXiv:2501.00574 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Videochat-flash: Hierarchical compression for long-context video modeling. arXiv preprint arXiv:2501.00574","author":"Li Xinhao","year":"2024","unstructured":"Xinhao Li, Yi Wang, Jiashuo Yu, Xiangyu Zeng, Yuhan Zhu, Haian Huang, Jianfei Gao, Kunchang Li, Yinan He, Chenting Wang, et al., 2024d. Videochat-flash: Hierarchical compression for long-context video modeling. arXiv preprint arXiv:2501.00574 (2024)."},{"key":"e_1_3_2_1_25_1","unstructured":"Yifei Li Junbo Niu Ziyang Miao Chunjiang Ge Yuanhang Zhou Qihao He Xiaoyi Dong Haodong Duan Shuangrui Ding Rui Qian Pan Zhang Yuhang Zang Yuhang Cao Conghui He and Jiaqi Wang. 2025c. OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video Understanding? arXiv:2501.05510 [cs.CV] https:\/\/arxiv.org\/abs\/2501.05510"},{"key":"e_1_3_2_1_26_1","volume-title":"European Conference on Computer Vision. Springer, 323-340","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024a. Llama-vid: An image is worth 2 tokens in large language models. In European Conference on Computer Vision. Springer, 323-340."},{"key":"e_1_3_2_1_27_1","volume-title":"European Conference on Computer Vision. Springer, 323-340","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024b. Llama-vid: An image is worth 2 tokens in large language models. In European Conference on Computer Vision. Springer, 323-340."},{"key":"e_1_3_2_1_28_1","volume-title":"StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video Understanding. arXiv preprint arXiv:2411.03628","author":"Lin Junming","year":"2024","unstructured":"Junming Lin, Zheng Fang, Chi Chen, Zihao Wan, Fuwen Luo, Peng Li, Yang Liu, and Maosong Sun. 2024a. StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video Understanding. arXiv preprint arXiv:2411.03628 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"e_1_3_2_1_30_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024a. LLaVA-NeXT: Improved reasoning OCR and world knowledge."},{"key":"e_1_3_2_1_31_1","first-page":"32076","article-title":"Et bench: Towards open-ended event-level video-language understanding","volume":"37","author":"Liu Ye","year":"2024","unstructured":"Ye Liu, Zongyang Ma, Zhongang Qi, Yang Wu, Ying Shan, and Chang W Chen. 2024b. Et bench: Towards open-ended event-level video-language understanding. Advances in Neural Information Processing Systems, Vol. 37 (2024), 32076-32110.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"Hybrid-Level Instruction Injection for Video Token Compression in Multi-modal Large Language Models. arXiv preprint arXiv:2503.16036","author":"Liu Zhihang","year":"2025","unstructured":"Zhihang Liu, Chen-Wei Xie, Pandeng Li, Liming Zhao, Longxiang Tang, Yun Zheng, Chuanbin Liu, and Hongtao Xie. 2025. Hybrid-Level Instruction Injection for Video Token Compression in Multi-modal Large Language Models. arXiv preprint arXiv:2503.16036 (2025)."},{"key":"e_1_3_2_1_33_1","volume-title":"Inf-MLLM: Efficient streaming inference of multimodal large language models on a single GPU. arXiv preprint arXiv:2409.09086","author":"Ning Zhenyu","year":"2024","unstructured":"Zhenyu Ning, Jieru Zhao, Qihao Jin, Wenchao Ding, and Minyi Guo. 2024. Inf-MLLM: Efficient streaming inference of multimodal large language models on a single GPU. arXiv preprint arXiv:2409.09086 (2024)."},{"key":"e_1_3_2_1_34_1","unstructured":"OpenAI. 2024. GPT-4o System Card."},{"key":"e_1_3_2_1_35_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"Dispider: Enabling Video LLMs with Active Real-Time Interaction via Disentangled Perception, Decision, and Reaction. arXiv preprint arXiv:2501.03218","author":"Qian Rui","year":"2025","unstructured":"Rui Qian, Shuangrui Ding, Xiaoyi Dong, Pan Zhang, Yuhang Zang, Yuhang Cao, Dahua Lin, and Jiaqi Wang. 2025. Dispider: Enabling Video LLMs with Active Real-Time Interaction via Disentangled Perception, Decision, and Reaction. arXiv preprint arXiv:2501.03218 (2025)."},{"key":"e_1_3_2_1_37_1","first-page":"119336","article-title":"Streaming long video understanding with large language models","volume":"37","author":"Qian Rui","year":"2024","unstructured":"Rui Qian, Xiaoyi Dong, Pan Zhang, Yuhang Zang, Shuangrui Ding, Dahua Lin, and Jiaqi Wang. 2024. Streaming long video understanding with large language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 119336-119360.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","volume-title":"TESTA: Temporal-Spatial Token Aggregation for Long-form Video-Language Understanding. ArXiv","author":"Ren Shuhuai","year":"2023","unstructured":"Shuhuai Ren, Sishuo Chen, Shicheng Li, Xu Sun, and Lu Hou. 2023. TESTA: Temporal-Spatial Token Aggregation for Long-form Video-Language Understanding. ArXiv, Vol. abs\/2310.19060 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.psych.53.100901.135125"},{"key":"e_1_3_2_1_41_1","volume-title":"Longvu: Spatiotemporal adaptive compression for long video-language understanding. arXiv preprint arXiv:2410.17434","author":"Shen Xiaoqian","year":"2024","unstructured":"Xiaoqian Shen, Yunyang Xiong, Changsheng Zhao, Lemeng Wu, Jun Chen, Chenchen Zhu, Zechun Liu, Fanyi Xiao, Balakrishnan Varadarajan, Florian Bordes, et al., 2024. Longvu: Spatiotemporal adaptive compression for long video-language understanding. arXiv preprint arXiv:2410.17434 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.0963-7214.2005.00332.x"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3651526"},{"key":"e_1_3_2_1_45_1","volume-title":"DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models. arXiv preprint arXiv:2411.15024","author":"Tao Keda","year":"2024","unstructured":"Keda Tao, Can Qin, Haoxuan You, Yang Sui, and Huan Wang. 2024. DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models. arXiv preprint arXiv:2411.15024 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024a. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Yueqian Wang Xiaojun Meng Yuxuan Wang Jianxin Liang Jiansheng Wei Huishuai Zhang and Dongyan Zhao. 2024b. VideoLLM Knows When to Speak: Enhancing Time-Sensitive Video Comprehension with Video-Text Duet Interaction Format. arXiv:2411.17991 [cs.CV] https:\/\/arxiv.org\/abs\/2411.17991","DOI":"10.32388\/1SMZIQ"},{"key":"e_1_3_2_1_48_1","volume-title":"Stop Looking for Important Tokens in Multimodal Language Models: Duplication Matters More. arXiv preprint arXiv:2502.11494","author":"Wen Zichen","year":"2025","unstructured":"Zichen Wen, Yifeng Gao, Shaobo Wang, Junyuan Zhang, Qintong Zhang, Weijia Li, Conghui He, and Linfeng Zhang. 2025. Stop Looking for Important Tokens in Multimodal Language Models: Duplication Matters More. arXiv preprint arXiv:2502.11494 (2025)."},{"key":"e_1_3_2_1_49_1","volume-title":"LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding. ArXiv preprint","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Dongxu Li, Bei Chen, and Junnan Li. 2024b. LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding. ArXiv preprint, Vol. abs\/2407.15754 (2024)."},{"key":"e_1_3_2_1_50_1","first-page":"109922","article-title":"Videollm-mod: Efficient video-language streaming with mixture-of-depths vision computation","volume":"37","author":"Wu Shiwei","year":"2024","unstructured":"Shiwei Wu, Joya Chen, Kevin Qinghong Lin, Qimeng Wang, Yan Gao, Qianli Xu, Tong Xu, Yao Hu, Enhong Chen, and Mike Zheng Shou. 2024a. Videollm-mod: Efficient video-language streaming with mixture-of-depths vision computation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 109922-109947.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","volume-title":"Streaming Video Understanding and Multi-round Interaction with Memory-enhanced Knowledge. arXiv preprint arXiv:2501.13468","author":"Xiong Haomiao","year":"2025","unstructured":"Haomiao Xiong, Zongxin Yang, Jiazuo Yu, Yunzhi Zhuge, Lu Zhang, Jiawen Zhu, and Huchuan Lu. 2025. Streaming Video Understanding and Multi-round Interaction with Memory-enhanced Knowledge. arXiv preprint arXiv:2501.13468 (2025)."},{"key":"e_1_3_2_1_52_1","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen Jialin Wang Yang Fan Kai Dang et al. 2025. Qwen2. 5-Omni Technical Report. arXiv preprint arXiv:2503.20215 (2025)."},{"key":"e_1_3_2_1_53_1","volume-title":"Slowfast-llava: A strong training-free baseline for video large language models. arXiv preprint arXiv:2407.15841","author":"Xu Mingze","year":"2024","unstructured":"Mingze Xu, Mingfei Gao, Zhe Gan, Hong-You Chen, Zhengfeng Lai, Haiming Gang, Kai Kang, and Afshin Dehghan. 2024. Slowfast-llava: A strong training-free baseline for video large language models. arXiv preprint arXiv:2407.15841 (2024)."},{"key":"e_1_3_2_1_54_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang Guanting Dong Haoran Wei Huan Lin Jialong Tang Jialin Wang Jian Yang Jianhong Tu Jianwei Zhang Jianxin Ma Jin Xu Jingren Zhou Jinze Bai Jinzheng He Junyang Lin Kai Dang Keming Lu Keqin Chen Kexin Yang Mei Li Mingfeng Xue Na Ni Pei Zhang Peng Wang Ru Peng Rui Men Ruize Gao Runji Lin Shijie Wang Shuai Bai Sinan Tan Tianhang Zhu Tianhao Li Tianyu Liu Wenbin Ge Xiaodong Deng Xiaohuan Zhou Xingzhang Ren Xinyu Zhang Xipin Wei Xuancheng Ren Yang Fan Yang Yao Yichang Zhang Yu Wan Yunfei Chu Yuqiong Liu Zeyu Cui Zhenru Zhang and Zhihao Fan. 2024c. Qwen2 Technical Report. ArXiv preprint Vol. abs\/2407.10671 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"PVC: Progressive Visual Token Compression for Unified Image and Video Processing in Large Vision-Language Models. arXiv preprint arXiv:2412.09613","author":"Yang Chenyu","year":"2024","unstructured":"Chenyu Yang, Xuan Dong, Xizhou Zhu, Weijie Su, Jiahao Wang, Hao Tian, Zhe Chen, Wenhai Wang, Lewei Lu, and Jifeng Dai. 2024b. PVC: Progressive Visual Token Compression for Unified Image and Video Processing in Large Vision-Language Models. arXiv preprint arXiv:2412.09613 (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Cheng Yang Yang Sui Jinqi Xiao Lingyi Huang Yu Gong Chendi Li Jinghua Yan Yu Bai Ponnuswamy Sadayappan Xia Hu et al. 2025. TopV: Compatible Token Pruning with Inference Time Optimization for Fast and Low-Memory Multimodal Vision Language Model. arXiv preprint arXiv:2503.18278 (2025).","DOI":"10.1109\/CVPR52734.2025.01844"},{"key":"e_1_3_2_1_57_1","volume-title":"Visionzip: Longer is better but not necessary in vision language models. arXiv preprint arXiv:2412.04467","author":"Yang Senqiao","year":"2024","unstructured":"Senqiao Yang, Yukang Chen, Zhuotao Tian, Chengyao Wang, Jingyao Li, Bei Yu, and Jiaya Jia. 2024a. Visionzip: Longer is better but not necessary in vision language models. arXiv preprint arXiv:2412.04467 (2024)."},{"key":"e_1_3_2_1_58_1","volume-title":"Deco: Decoupling token compression from semantic abstraction in multimodal large language models. arXiv preprint arXiv:2405.20985","author":"Yao Linli","year":"2024","unstructured":"Linli Yao, Lei Li, Shuhuai Ren, Lean Wang, Yuanxin Liu, Xu Sun, and Lu Hou. 2024. Deco: Decoupling token compression from semantic abstraction in multimodal large language models. arXiv preprint arXiv:2405.20985 (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding. arXiv preprint arXiv:2501.07888","author":"Yuan Liping","year":"2025","unstructured":"Liping Yuan, Jiawei Wang, Haomiao Sun, Yuchen Zhang, and Yuan Lin. 2025. Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding. arXiv preprint arXiv:2501.07888 (2025)."},{"key":"e_1_3_2_1_60_1","unstructured":"Boqiang Zhang Kehan Li Zesen Cheng Zhiqiang Hu Yuqian Yuan Guanzheng Chen Sicong Leng Yuming Jiang Hang Zhang Xin Li et al. 2025. VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding. arXiv preprint arXiv:2501.13106 (2025)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_1_62_1","volume-title":"Flash-vstream: Memory-based real-time understanding for long video streams. arXiv preprint arXiv:2406.08085","author":"Zhang Haoji","year":"2024","unstructured":"Haoji Zhang, Yiqin Wang, Yansong Tang, Yong Liu, Jiashi Feng, Jifeng Dai, and Xiaojie Jin. 2024d. Flash-vstream: Memory-based real-time understanding for long video streams. arXiv preprint arXiv:2406.08085 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short Videos. arXiv preprint arXiv:2410.02763","author":"Zhang Jianrui","year":"2024","unstructured":"Jianrui Zhang, Mu Cai, and Yong Jae Lee. 2024a. Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short Videos. arXiv preprint arXiv:2410.02763 (2024)."},{"key":"e_1_3_2_1_64_1","unstructured":"Pan Zhang Xiaoyi Dong Yuhang Cao Yuhang Zang Rui Qian Xilin Wei Lin Chen Yifei Li Junbo Niu Shuangrui Ding et al. 2024b. Internlm-xcomposer2. 5-omnilive: A comprehensive multimodal system for long-term streaming video and audio interactions. arXiv preprint arXiv:2412.09596 (2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"A Comprehensive Multimodal System for Long-term Streaming Video and Audio Interactions. arXiv preprint arXiv:2412.09596","author":"Zhang Pan","year":"2024","unstructured":"Pan Zhang, Xiaoyi Dong, Yuhang Cao, Yuhang Zang, Rui Qian, Xilin Wei, Lin Chen, Yifei Li, Junbo Niu, Shuangrui Ding, Qipeng Guo, Haodong Duan, Xin Chen, Han Lv, Zheng Nie, Min Zhang, Bin Wang, Wenwei Zhang, Xinyue Zhang, Jiaye Ge, Wei Li, Jingwen Li, Zhongying Tu, Conghui He, Xingcheng Zhang, Kai Chen, Yu Qiao, Dahua Lin, and Jiaqi Wang. 2024c. InternLM-XComposer2.5-OmniLive: A Comprehensive Multimodal System for Long-term Streaming Video and Audio Interactions. arXiv preprint arXiv:2412.09596 (2024)."},{"key":"e_1_3_2_1_66_1","unstructured":"Yuanhan Zhang Jinming Wu Wei Li Bo Li Zejun Ma Ziwei Liu and Chunyuan Li. 2024 e. Video Instruction Tuning With Synthetic Data. arXiv:2410.02713 [cs.CV] https:\/\/arxiv.org\/abs\/2410.02713"},{"key":"e_1_3_2_1_67_1","first-page":"34661","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","volume":"36","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al., 2023b. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 34661-34710.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_68_1","volume-title":"MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding. ArXiv preprint","author":"Zhou Junjie","year":"2024","unstructured":"Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Shitao Xiao, Xi Yang, Yongping Xiong, Bo Zhang, Tiejun Huang, and Zheng Liu. 2024. MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding. ArXiv preprint, Vol. abs\/2406.04264 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754839","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:41:40Z","timestamp":1765309300000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754839"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":68,"alternative-id":["10.1145\/3746027.3754839","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754839","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}