{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:41:17Z","timestamp":1765309277986,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755307","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:17Z","timestamp":1761375257000},"page":"4068-4076","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CITR: Efficient Long Video Understanding Needs Causal Importance"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2397-2163","authenticated-orcid":false,"given":"Ziqi","family":"Yuan","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5473-2042","authenticated-orcid":false,"given":"Jun","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7283-7344","authenticated-orcid":false,"given":"Yanghao","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9208-9840","authenticated-orcid":false,"given":"Yuxiang","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5254-4080","authenticated-orcid":false,"given":"Chi","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5408-3145","authenticated-orcid":false,"given":"Shuo","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4421-273X","authenticated-orcid":false,"given":"Zhinan","family":"Gou","sequence":"additional","affiliation":[{"name":"Hebei University of Economics and Business, Shijiazhuang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Longvila: Scaling long-context visual language models for long videos. arXiv preprint arXiv:2408.10188","author":"Chen Yukang","year":"2024","unstructured":"Yukang Chen, Fuzhao Xue, Dacheng Li, Qinghao Hu, Ligeng Zhu, Xiuyu Li, Yunhao Fang, Haotian Tang, Shang Yang, Zhijian Liu, et al. 2024. Longvila: Scaling long-context visual language models for long videos. arXiv preprint arXiv:2408.10188 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Video-ccam: Enhancing video-language understanding with causal crossattention masks for short and long videos. arXiv preprint arXiv:2408.14023","author":"Fei Jiajun","year":"2024","unstructured":"Jiajun Fei, Dian Li, Zhidong Deng, Zekun Wang, Gang Liu, and Hui Wang. 2024. Video-ccam: Enhancing video-language understanding with causal crossattention masks for short and long videos. arXiv preprint arXiv:2408.14023 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al. 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"FrameFusion: Combining Similarity and Importance for Video Token Reduction on Large Visual Language Models. arXiv preprint arXiv:2501.01986","author":"Fu Tianyu","year":"2024","unstructured":"Tianyu Fu, Tengxuan Liu, Qinghao Han, Guohao Dai, Shengen Yan, Huazhong Yang, Xuefei Ning, and Yu Wang. 2024. FrameFusion: Combining Similarity and Importance for Video Token Reduction on Large Visual Language Models. arXiv preprint arXiv:2501.01986 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Locret: Enhancing Eviction in Long-Context LLM Inference with Trained Retaining Heads. arXiv preprint arXiv:2410.01805","author":"Huang Yuxiang","year":"2024","unstructured":"Yuxiang Huang, Binhang Yuan, Xu Han, Chaojun Xiao, and Zhiyuan Liu. 2024. Locret: Enhancing Eviction in Long-Context LLM Inference with Trained Retaining Heads. arXiv preprint arXiv:2410.01805 (2024)."},{"key":"e_1_3_2_1_6_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al. 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models. arXiv preprint arXiv:2407.07895","author":"Li Feng","year":"2024","unstructured":"Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, and Chunyuan Li. 2024. Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models. arXiv preprint arXiv:2407.07895 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Videochatflash: Hierarchical compression for long-context video modeling. arXiv preprint arXiv:2501.00574","author":"Li Xinhao","year":"2024","unstructured":"Xinhao Li, Yi Wang, Jiashuo Yu, Xiangyu Zeng, Yuhan Zhu, Haian Huang, Jianfei Gao, Kunchang Li, Yinan He, Chenting Wang, et al. 2024. Videochatflash: Hierarchical compression for long-context video modeling. arXiv preprint arXiv:2501.00574 (2024)."},{"key":"e_1_3_2_1_10_1","first-page":"22947","article-title":"Snapkv: Llm knows what you are looking for before generation","volume":"37","author":"Li Yuhong","year":"2024","unstructured":"Yuhong Li, Yingbing Huang, Bowen Yang, Bharat Venkitesh, Acyr Locatelli, Hanchen Ye, Tianle Cai, Patrick Lewis, and Deming Chen. 2024. Snapkv: Llm knows what you are looking for before generation. Advances in Neural Information Processing Systems 37 (2024), 22947-22970.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_11_1","volume-title":"European Conference on Computer Vision. Springer, 323-340","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024. Llama-vid: An image is worth 2 tokens in large language models. In European Conference on Computer Vision. Springer, 323-340."},{"key":"e_1_3_2_1_12_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Streamingbench: Assessing the gap for mllms to achieve streaming video understanding. arXiv preprint arXiv:2411.03628","author":"Lin Junming","year":"2024","unstructured":"Junming Lin, Zheng Fang, Chi Chen, Zihao Wan, Fuwen Luo, Peng Li, Yang Liu, and Maosong Sun. 2024. Streamingbench: Assessing the gap for mllms to achieve streaming video understanding. arXiv preprint arXiv:2411.03628 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Jianbin Jiao, EnhuaWu, and Jie Hu.","author":"Liu Jiajun","year":"2024","unstructured":"Jiajun Liu, Yibing Wang, Hanghang Ma, Xiaoping Wu, Xiaoqi Ma, xiaoming Wei, Jianbin Jiao, EnhuaWu, and Jie Hu. 2024. Kangaroo: A Powerful Video-Language Model Supporting Long-context Video Input. arXiv preprint arXiv:2408.15542 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Ppllava: Varied video sequence understanding with prompt guidance. arXiv preprint arXiv:2411.02327","author":"Liu Ruyang","year":"2024","unstructured":"Ruyang Liu, Haoran Tang, Haibo Liu, Yixiao Ge, Ying Shan, Chen Li, and Jiankun Yang. 2024. Ppllava: Varied video sequence understanding with prompt guidance. arXiv preprint arXiv:2411.02327 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"NVILA: Efficient frontier visual language models. arXiv preprint arXiv:2412.04468","author":"Liu Zhijian","year":"2024","unstructured":"Zhijian Liu, Ligeng Zhu, Baifeng Shi, Zhuoyang Zhang, Yuming Lou, Shang Yang, Haocheng Xi, Shiyi Cao, Yuxian Gu, Dacheng Li, et al. 2024. NVILA: Efficient frontier visual language models. arXiv preprint arXiv:2412.04468 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Inf- MLLM: Efficient streaming inference of multimodal large language models on a single GPU. arXiv preprint arXiv:2409.09086","author":"Ning Zhenyu","year":"2024","unstructured":"Zhenyu Ning, Jieru Zhao, Qihao Jin, Wenchao Ding, and Minyi Guo. 2024. Inf- MLLM: Efficient streaming inference of multimodal large language models on a single GPU. arXiv preprint arXiv:2409.09086 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"xgenmm- vid (blip-3-video): You only need 32 tokens to represent a video even in vlms. arXiv preprint arXiv:2410.16267","author":"Ryoo Michael S","year":"2024","unstructured":"Michael S Ryoo, Honglu Zhou, Shrikant Kendre, Can Qin, Le Xue, Manli Shu, Silvio Savarese, Ran Xu, Caiming Xiong, and Juan Carlos Niebles. 2024. xgenmm- vid (blip-3-video): You only need 32 tokens to represent a video even in vlms. arXiv preprint arXiv:2410.16267 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)."},{"key":"e_1_3_2_1_20_1","volume-title":"Longvu: Spatiotemporal adaptive compression for long video-language understanding. arXiv preprint arXiv:2410.17434","author":"Shen Xiaoqian","year":"2024","unstructured":"Xiaoqian Shen, Yunyang Xiong, Changsheng Zhao, Lemeng Wu, Jun Chen, Chenchen Zhu, Zechun Liu, Fanyi Xiao, Balakrishnan Varadarajan, Florian Bordes, et al. 2024. Longvu: Spatiotemporal adaptive compression for long video-language understanding. arXiv preprint arXiv:2410.17434 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Video-xl: Extra-long vision language model for hour-scale video understanding. arXiv preprint arXiv:2409.14485","author":"Shu Yan","year":"2024","unstructured":"Yan Shu, Zheng Liu, Peitian Zhang, Minghao Qin, Junjie Zhou, Zhengyang Liang, Tiejun Huang, and Bo Zhao. 2024. Video-xl: Extra-long vision language model for hour-scale video understanding. arXiv preprint arXiv:2409.14485 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_1_23_1","volume-title":"Moviechat: Question-aware sparse memory for long video question answering. arXiv preprint arXiv:2404.17176","author":"Song Enxin","year":"2024","unstructured":"Enxin Song, Wenhao Chai, Tian Ye, Jenq-Neng Hwang, Xi Li, and Gaoang Wang. 2024. Moviechat: Question-aware sparse memory for long video question answering. arXiv preprint arXiv:2404.17176 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"FlashSloth: Lightning Multimodal Large Language Models via Embedded Visual Compression. arXiv preprint arXiv:2412.04317","author":"Tong Bo","year":"2024","unstructured":"Bo Tong, Bokai Lai, Yiyi Zhou, Gen Luo, Yunhang Shen, Ke Li, Xiaoshuai Sun, and Rongrong Ji. 2024. FlashSloth: Lightning Multimodal Large Language Models via Embedded Visual Compression. arXiv preprint arXiv:2412.04317 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Look-m: Look-once optimization in kv cache for efficient multimodal long-context inference. arXiv preprint arXiv:2406.18139","author":"Liu Che","year":"2024","unstructured":"ZhongweiWan, ZiangWu, Che Liu, Jinfa Huang, Zhihong Zhu, Peng Jin, Longyue Wang, and Li Yuan. 2024. Look-m: Look-once optimization in kv cache for efficient multimodal long-context inference. arXiv preprint arXiv:2406.18139 (2024)."},{"key":"e_1_3_2_1_27_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a Hybrid Architecture. arXiv preprint arXiv:2409.02889","author":"Wang Xidong","year":"2024","unstructured":"Xidong Wang, Dingjie Song, Shunian Chen, Chen Zhang, and Benyou Wang. 2024. LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a Hybrid Architecture. arXiv preprint arXiv:2409.02889 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"European Conference on Computer Vision. Springer, 396-416","author":"Wang Yi","year":"2024","unstructured":"Yi Wang, Kunchang Li, Xinhao Li, Jiashuo Yu, Yinan He, Guo Chen, Baoqi Pei, Rongkun Zheng, Zun Wang, Yansong Shi, et al. 2024. Internvideo2: Scaling foundation models for multimodal video understanding. In European Conference on Computer Vision. Springer, 396-416."},{"key":"e_1_3_2_1_30_1","volume-title":"Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191","author":"Li Kunchang","year":"2022","unstructured":"YiWang, Kunchang Li, Yizhuo Li, Yinan He, Bingkun Huang, Zhiyu Zhao, Hongjie Zhang, Jilan Xu, Yi Liu, Zun Wang, et al. 2022. Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Visual context window extension: A new perspective for long video understanding. arXiv preprint arXiv:2409.20018","author":"Wei Hongchen","year":"2024","unstructured":"Hongchen Wei and Zhenzhong Chen. 2024. Visual context window extension: A new perspective for long video understanding. arXiv preprint arXiv:2409.20018 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"European Conference on Computer Vision. Springer, 453-470","author":"Weng Yuetian","year":"2024","unstructured":"Yuetian Weng, Mingfei Han, Haoyu He, Xiaojun Chang, and Bohan Zhuang. 2024. Longvlm: Efficient long video understanding via large language models. In European Conference on Computer Vision. Springer, 453-470."},{"key":"e_1_3_2_1_33_1","first-page":"28828","article-title":"Longvideobench: A benchmark for long-context interleaved video-language understanding","volume":"37","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Dongxu Li, Bei Chen, and Junnan Li. 2024. Longvideobench: A benchmark for long-context interleaved video-language understanding. Advances in Neural Information Processing Systems 37 (2024), 28828-28857.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Efficient Streaming Language Models with Attention Sinks. In The Twelfth International Conference on Learning Representations.","author":"Xiao Guangxuan","key":"e_1_3_2_1_34_1","unstructured":"Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. [n.d.]. Efficient Streaming Language Models with Attention Sinks. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_36_1","volume-title":"Slowfast-llava: A strong training-free baseline for video large language models. arXiv preprint arXiv:2407.15841","author":"Xu Mingze","year":"2024","unstructured":"Mingze Xu, Mingfei Gao, Zhe Gan, Hong-You Chen, Zhengfeng Lai, Haiming Gang, Kai Kang, and Afshin Dehghan. 2024. Slowfast-llava: A strong training-free baseline for video large language models. arXiv preprint arXiv:2407.15841 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421 9, 1","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, JianfengWang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421 9, 1 (2023), 1."},{"key":"e_1_3_2_1_38_1","volume-title":"Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al. 2024. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Video-llama: An instructiontuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instructiontuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Long Context Transfer from Language to Vision. arXiv preprint arXiv:2406.16852","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Kaichen Zhang, Bo Li, Guangtao Zeng, Jingkang Yang, Yuanhan Zhang, Ziyue Wang, Haoran Tan, Chunyuan Li, and Ziwei Liu. 2024. Long Context Transfer from Language to Vision. arXiv preprint arXiv:2406.16852 (2024). https:\/\/arxiv.org\/abs\/2406.16852"},{"key":"e_1_3_2_1_41_1","volume-title":"Long context transfer from language to vision. arXiv preprint arXiv:2406.16852","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Kaichen Zhang, Bo Li, Guangtao Zeng, Jingkang Yang, Yuanhan Zhang, Ziyue Wang, Haoran Tan, Chunyuan Li, and Ziwei Liu. 2024. Long context transfer from language to vision. arXiv preprint arXiv:2406.16852 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Jinming Wu, Wei Li, Bo Li, Zejun Ma, Ziwei Liu, and Chunyuan Li. 2024. Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713 (2024)."},{"key":"e_1_3_2_1_43_1","first-page":"34661","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","volume":"36","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. 2023. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems 36 (2023), 34661-34710.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","volume-title":"Needle In A Video Haystack: A Scalable Synthetic Evaluator for Video MLLMs. arXiv preprint arXiv:2406.09367","author":"Zhao Zijia","year":"2024","unstructured":"Zijia Zhao, Haoyu Lu, Yuqi Huo, Yifan Du, Tongtian Yue, Longteng Guo, Bingning Wang,Weipeng Chen, and Jing Liu. 2024. Needle In A Video Haystack: A Scalable Synthetic Evaluator for Video MLLMs. arXiv preprint arXiv:2406.09367 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755307","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:38:06Z","timestamp":1765309086000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755307"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":44,"alternative-id":["10.1145\/3746027.3755307","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755307","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}