{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:55:40Z","timestamp":1781538940805,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62222209"],"award-info":[{"award-number":["No.62222209"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017582","name":"Beijing National Research Center For Information Science And Technology","doi-asserted-by":"publisher","award":["No.BNR2023TD03006"],"award-info":[{"award-number":["No.BNR2023TD03006"]}],"id":[{"id":"10.13039\/501100017582","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810717","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1413-1422","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Reflective Cross-Granularity Grounding with Preference Optimization for Long Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4796-4205","authenticated-orcid":false,"given":"Wei","family":"Feng","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0351-2939","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0943-2286","authenticated-orcid":false,"given":"Hong","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5822-5646","authenticated-orcid":false,"given":"Yu-Wei","family":"Zhan","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1485-0536","authenticated-orcid":false,"given":"Zihan","family":"Song","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2504-3689","authenticated-orcid":false,"given":"Bin","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3450-400X","authenticated-orcid":false,"given":"Kecheng","family":"Zheng","sequence":"additional","affiliation":[{"name":"Ant Research, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2236-9290","authenticated-orcid":false,"given":"Wenwu","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.12966 (2023)."},{"key":"e_1_3_3_2_3_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et\u00a0al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.13923 (2025)."},{"key":"e_1_3_3_2_4_2","unstructured":"Yuntao Bai Saurav Kadavath Sandipan Kundu Amanda Askell Jackson Kernion Andy Jones Anna Chen Anna Goldie Azalia Mirhoseini Cameron McKinnon et\u00a0al. 2022. Constitutional ai: Harmlessness from ai feedback. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.08073 (2022)."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_3_2_6_2","unstructured":"Rohan Bavishi Erich Elsen Curtis Hawthorne Maxwell Nye Augustus Odena Arushi Somani and Sa\u011fnak Ta\u015f\u0131rlar. 2023. Fuyu-8B: A multimodal architecture for AI agents."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Lin Chen Xilin Wei Jinsong Li Xiaoyi Dong Pan Zhang Yuhang Zang Zehui Chen Haodong Duan Bin Lin Zhenyu Tang et\u00a0al. 2024. Sharegpt4video: Improving video understanding and generation with better captions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.04325 (2024).","DOI":"10.52202\/079017-0614"},{"key":"e_1_3_3_2_8_2","unstructured":"Zesen Cheng Sicong Leng Hang Zhang Yifei Xin Xin Li Guanzheng Chen Yongxin Zhu Wenqi Zhang Ziyang Luo Deli Zhao et\u00a0al. 2024. VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.07476 (2024)."},{"key":"e_1_3_3_2_9_2","unstructured":"Paul\u00a0F Christiano Jan Leike Tom Brown Miljan Martic Shane Legg and Dario Amodei. 2017. Deep reinforcement learning from human preferences. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32276"},{"key":"e_1_3_3_2_11_2","unstructured":"Chaoyou Fu Yuhan Dai Yongdong Luo Lei Li Shuhuai Ren Renrui Zhang Zihan Wang Chenyu Zhou Yunhang Shen Mengdan Zhang et\u00a0al. 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.21075 (2024)."},{"key":"e_1_3_3_2_12_2","unstructured":"Chaoyou Fu Haojia Lin Xiong Wang Yi-Fan Zhang Yunhang Shen Xiaoyu Liu Haoyu Cao Zuwei Long Heting Gao Ke Li et\u00a0al. 2025. Vita-1.5: Towards gpt-4o level real-time vision and speech interaction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.01957 (2025)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32341"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_2_19_2","unstructured":"KunChang Li Yinan He Yi Wang Yizhuo Li Wenhai Wang Ping Luo Yali Wang Limin Wang and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.06355 (2023)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_3_2_21_2","first-page":"323","volume-title":"European Conference on Computer Vision","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024. Llama-vid: An image is worth 2 tokens in large language models. In European Conference on Computer Vision. Springer, 323\u2013340."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_2_24_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_3_2_25_2","unstructured":"Yongdong Luo Xiawu Zheng Xiao Yang Guilin Li Haojia Lin Jinfa Huang Jiayi Ji Fei Chao Jiebo Luo and Rongrong Ji. 2024. Video-rag: Visually-aligned retrieval-augmented long video comprehension. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.13093 (2024)."},{"key":"e_1_3_3_2_26_2","volume-title":"Forty-first International Conference on Machine Learning","author":"Qian Long","year":"2024","unstructured":"Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, and Siliang Tang. 2024. Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_3_2_27_2","unstructured":"Minghao Qin Xiangrui Liu Zhengyang Liang Yan Shu Huaying Yuan Juenjie Zhou Shitao Xiao Bo Zhao and Zheng Liu. 2025. Video-XL-2: Towards Very Long-Video Understanding Through Task-Aware KV Sparsification. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.19225 (2025)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00191"},{"key":"e_1_3_3_2_29_2","unstructured":"Rafael Rafailov Archit Sharma Eric Mitchell Christopher\u00a0D Manning Stefano Ermon and Chelsea Finn. 2024. Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman et\u00a0al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems 35 (2022) 25278\u201325294.","DOI":"10.52202\/068431-1833"},{"key":"e_1_3_3_2_32_2","volume-title":"Forty-second International Conference on Machine Learning","author":"Shen Xiaoqian","year":"2025","unstructured":"Xiaoqian Shen, Yunyang Xiong, Changsheng Zhao, Lemeng Wu, Jun Chen, Chenchen Zhu, Zechun Liu, Fanyi Xiao, Balakrishnan Varadarajan, Florian Bordes, et\u00a0al. 2025. LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding. In Forty-second International Conference on Machine Learning."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02436"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32775"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02711"},{"key":"e_1_3_3_2_37_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_2_38_2","unstructured":"Weihan Wang Zehai He Wenyi Hong Yean Cheng Xiaohan Zhang Ji Qi Xiaotao Gu Shiyu Huang Bin Xu Yuxiao Dong et\u00a0al. 2024. Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08035 (2024)."},{"key":"e_1_3_3_2_39_2","first-page":"58","volume-title":"European Conference on Computer Vision","author":"Wang Xiaohan","year":"2024","unstructured":"Xiaohan Wang, Yuhui Zhang, Orr Zohar, and Serena Yeung-Levy. 2024. Videoagent: Long-form video understanding with large language model as agent. In European Conference on Computer Vision. Springer, 58\u201376."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00311"},{"key":"e_1_3_3_2_41_2","volume-title":"NIPS","author":"Waswani A","year":"2017","unstructured":"A Waswani, N Shazeer, N Parmar, J Uszkoreit, L Jones, A Gomez, L Kaiser, and I Polosukhin. 2017. Attention is all you need. In NIPS."},{"key":"e_1_3_3_2_42_2","first-page":"453","volume-title":"European Conference on Computer Vision","author":"Weng Yuetian","year":"2024","unstructured":"Yuetian Weng, Mingfei Han, Haoyu He, Xiaojun Chang, and Bohan Zhuang. 2024. Longvlm: Efficient long video understanding via large language models. In European Conference on Computer Vision. Springer, 453\u2013470."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Haoning Wu Dongxu Li Bei Chen and Junnan Li. 2024. Longvideobench: A benchmark for long-context interleaved video-language understanding. Advances in Neural Information Processing Systems 37 (2024) 28828\u201328857.","DOI":"10.52202\/079017-0907"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01284"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Shoubin Yu Jaemin Cho Prateek Yadav and Mohit Bansal. 2023. Self-chained image-language model for video localization and question answering. Advances in Neural Information Processing Systems 36 (2023) 76749\u201376771.","DOI":"10.52202\/075280-3354"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_3_2_50_2","unstructured":"Peiyuan Zhang Kaichen Zhang Bo Li Guangtao Zeng Jingkang Yang Yuanhan Zhang Ziyue Wang Haoran Tan Chunyuan Li and Ziwei Liu. 2024. Long context transfer from language to vision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16852 (2024)."},{"key":"e_1_3_3_2_51_2","unstructured":"Yuanhan Zhang Bo Li haotian Liu Yong\u00a0jae Lee Liangke Gui Di Fu Jiashi Feng Ziwei Liu and Chunyuan Li. 2024. LLaVA-NeXT: A Strong Zero-shot Video Understanding Model. https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"},{"key":"e_1_3_3_2_52_2","unstructured":"Yuanhan Zhang Jinming Wu Wei Li Bo Li Zejun Ma Ziwei Liu and Chunyuan Li. 2024. Video instruction tuning with synthetic data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02713 (2024)."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01278"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:19:41Z","timestamp":1781536781000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810717"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":52,"alternative-id":["10.1145\/3805622.3810717","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810717","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}