{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:06:45Z","timestamp":1765343205069,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"name":"RGC GRF","award":["15200321, 15201322, 15230624, 15239925"],"award-info":[{"award-number":["15200321, 15201322, 15230624, 15239925"]}]},{"name":"ITC","award":["ITF-ITS\/056\/22MX, ITS\/052\/23MX"],"award-info":[{"award-number":["ITF-ITS\/056\/22MX, ITS\/052\/23MX"]}]},{"name":"PolyU","award":["1-CDKK, G-SAC8, K-ZYAP"],"award-info":[{"award-number":["1-CDKK, G-SAC8, K-ZYAP"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754765","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"965-974","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Accelerating Long Video Understanding via Compressed Scene Graph-Enabled Chain-of-Thought"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9301-8610","authenticated-orcid":false,"given":"Tao","family":"Ling","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5555-417X","authenticated-orcid":false,"given":"Siping","family":"Shi","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0921-2726","authenticated-orcid":false,"given":"Dan","family":"Wang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Towards reasoning era: A survey of long chain-of-thought for reasoning large language models. arXiv preprint arXiv:2503.09567","author":"Chen Qiguang","year":"2025","unstructured":"Qiguang Chen, Libo Qin, Jinhao Liu, Dengyun Peng, Jiannan Guan, Peng Wang, Mengkang Hu, Yuhang Zhou, Te Gao, and Wangxiang Che. 2025. Towards reasoning era: A survey of long chain-of-thought for reasoning large language models. arXiv preprint arXiv:2503.09567 (2025)."},{"key":"e_1_3_2_1_2_1","unstructured":"Zesen Cheng Sicong Leng Hang Zhang Yifei Xin Xin Li Guanzheng Chen Yongxin Zhu Wenqi Zhang Ziyang Luo Deli Zhao et al. 2024. Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms. arXiv preprint arXiv:2406.07476 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/vicuna.lmsys.org","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Ziqing Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al., 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/vicuna.lmsys.org (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681484"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning. 13109-13125","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong Li Lee, and Wynne Hsu. 2024. Video-of-thought: step-by-step video reasoning from perception to cognition. In Proceedings of the 41st International Conference on Machine Learning. 13109-13125."},{"key":"e_1_3_2_1_6_1","volume-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al., 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681249"},{"key":"e_1_3_2_1_8_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Guo Ziyu","year":"2025","unstructured":"Ziyu Guo, Renrui Zhang, Chengzhuo Tong, Zhizheng Zhao, Peng Gao, Hongsheng Li, and Pheng-Ann Heng. 2025. Can We Generate Images with CoT? Let's Verify and Reinforce Image Generation Step by Step. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_10_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Herzig Roei","year":"2018","unstructured":"Roei Herzig, Moshiko Raboh, Gal Chechik, Jonathan Berant, and Amir Globerson. 2018. Mapping images to scene graphs with permutation-invariant structured prediction. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00133"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448321"},{"key":"e_1_3_2_1_13_1","volume-title":"RelViT: Concept-guided Vision Transformer for Visual Relational Reasoning. In International Conference on Learning Representations.","author":"Ma Xiaojian","year":"2022","unstructured":"Xiaojian Ma, Weili Nie, Zhiding Yu, Huaizu Jiang, Chaowei Xiao, Yuke Zhu, Song-Chun Zhu, and Anima Anandkumar. 2022. RelViT: Concept-guided Vision Transformer for Visual Relational Reasoning. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_14_1","volume-title":"European Conference on Computer Vision. Springer, 403-420","author":"Ma Yingzi","year":"2024","unstructured":"Yingzi Ma, Yulong Cao, Jiachen Sun, Marco Pavone, and Chaowei Xiao. 2024. Dolphins: Multimodal language model for driving. In European Conference on Computer Vision. Springer, 403-420."},{"key":"e_1_3_2_1_15_1","first-page":"46212","article-title":"Egoschema: A diagnostic benchmark for very long-form video language understanding","volume":"36","author":"Mangalam Karttikeya","year":"2023","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2023. Egoschema: A diagnostic benchmark for very long-form video language understanding. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46212-46244.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01367"},{"key":"e_1_3_2_1_17_1","volume-title":"STEP: Enhancing Video-LLMs' Compositional Reasoning by Spatio-Temporal Graph-guided Self-Training. arXiv preprint arXiv:2412.00161","author":"Qiu Haiyi","year":"2024","unstructured":"Haiyi Qiu, Minghe Gao, Long Qian, Kaihang Pan, Qifan Yu, Juncheng Li, Wenjie Wang, Siliang Tang, Yueting Zhuang, and Tat-Seng Chua. 2024. STEP: Enhancing Video-LLMs' Compositional Reasoning by Spatio-Temporal Graph-guided Self-Training. arXiv preprint arXiv:2412.00161 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"European Conference on Computer Vision. Springer, 305-322","author":"Tan Cheng","year":"2024","unstructured":"Cheng Tan, Jingxuan Wei, Zhangyang Gao, Linzhuang Sun, Siyuan Li, Ruifeng Guo, Bihui Yu, and Stan Z Li. 2024. Boosting the power of small multimodal reasoning models to match larger models with self-consistency training. In European Conference on Computer Vision. Springer, 305-322."},{"key":"e_1_3_2_1_19_1","volume-title":"ALLVB: All-in-One Long Video Understanding Benchmark. arXiv preprint arXiv:2503.07298","author":"Tan Xichen","year":"2025","unstructured":"Xichen Tan, Yuanjing Luo, Yunfan Ye, Fang Liu, and Zhiping Cai. 2025. ALLVB: All-in-One Long Video Understanding Benchmark. arXiv preprint arXiv:2503.07298 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680730"},{"key":"e_1_3_2_1_21_1","unstructured":"Yunlong Tang Jing Bi Siting Xu Luchuan Song Susan Liang Teng Wang Daoan Zhang Jie An Jingyang Lin Rongyi Zhu et al. 2023. Video understanding with large language models: A survey. arXiv preprint arXiv:2312.17432 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612035"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29884"},{"key":"e_1_3_2_1_24_1","volume-title":"Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:2406.08035","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Zehai He, Wenyi Hong, Yean Cheng, Xiaohan Zhang, Ji Qi, Xiaotao Gu, Shiyu Huang, Bin Xu, Yuxiao Dong, et al., 2024a. Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:2406.08035 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612152"},{"key":"e_1_3_2_1_26_1","volume-title":"European Conference on Computer Vision. Springer, 58-76","author":"Wang Xiaohan","year":"2024","unstructured":"Xiaohan Wang, Yuhui Zhang, Orr Zohar, and Serena Yeung-Levy. 2024d. Videoagent: Long-form video understanding with large language model as agent. In European Conference on Computer Vision. Springer, 58-76."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612283"},{"key":"e_1_3_2_1_28_1","volume-title":"Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv preprint arXiv:2503.12605","author":"Wang Yaoting","year":"2025","unstructured":"Yaoting Wang, Shengqiong Wu, Yuecheng Zhang, William Wang, Ziwei Liu, Jiebo Luo, and Hao Fei. 2025. Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv preprint arXiv:2503.12605 (2025)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.alvr-1.8"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680810"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681442"},{"key":"e_1_3_2_1_33_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Wu Wenshan","year":"2024","unstructured":"Wenshan Wu, Shaoguang Mao, Yadong Zhang, Yan Xia, Li Dong, Lei Cui, and Furu Wei. 2024a. Mind's Eye of LLMs: Visualization-of-Thought Elicits Spatial Reasoning in Large Language Models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_34_1","volume-title":"European Conference on Computer Vision. Springer, 164-182","author":"Wu Yixuan","year":"2024","unstructured":"Yixuan Wu, Yizhou Wang, Shixiang Tang, Wenhao Wu, Tong He, Wanli Ouyang, Philip Torr, and Jian Wu. 2024b. Dettoolchain: A new prompting paradigm to unleash detection ability of mllm. In European Conference on Computer Vision. Springer, 164-182."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"volume-title":"The First Workshop on System-2 Reasoning at Scale, NeurIPS'24","author":"Xie Yuxi","key":"e_1_3_2_1_36_1","unstructured":"Yuxi Xie, Anirudh Goyal, Wenyue Zheng, Min-Yen Kan, Timothy P Lillicrap, Kenji Kawaguchi, and Michael Shieh. [n.d.]. Monte Carlo Tree Search Boosts Reasoning via Iterative Preference Learning. In The First Workshop on System-2 Reasoning at Scale, NeurIPS'24."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612871"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"e_1_3_2_1_39_1","first-page":"15024","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING","author":"Yang Jun Cheng","year":"2024","unstructured":"Jun Cheng Yang, Zuchao Li, Shuai Xie, Wei Yu, Shijun Li, and Bo Du. 2024. Soft-Prompting with Graph-of-Thought for Multi-modal Representation Learning. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). 15024-15036."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00671"},{"key":"e_1_3_2_1_41_1","volume-title":"Movqa: A benchmark of versatile question-answering for long-form movie understanding. arXiv preprint arXiv:2312.04817","author":"Zhang Hongjie","year":"2023","unstructured":"Hongjie Zhang, Yi Liu, Lu Dong, Yifei Huang, Zhen-Hua Ling, Yali Wang, Limin Wang, and Yu Qiao. 2023a. Movqa: A benchmark of versatile question-answering for long-form movie understanding. arXiv preprint arXiv:2312.04817 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023b. Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681407"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612096"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681102"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01258"},{"key":"e_1_3_2_1_47_1","volume-title":"Mlvu: A comprehensive benchmark for multi-task long video understanding. arXiv preprint arXiv:2406.04264","author":"Zhou Junjie","year":"2024","unstructured":"Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Shitao Xiao, Xi Yang, Yongping Xiong, Bo Zhang, Tiejun Huang, and Zheng Liu. 2024. Mlvu: A comprehensive benchmark for multi-task long video understanding. arXiv preprint arXiv:2406.04264 (2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"Heqing Zou Tianze Luo Guiyang Xie Fengmao Lv Guangcong Wang Junyang Chen Zhuochen Wang Hansheng Zhang Huaijian Zhang et al. 2024. From seconds to hours: Reviewing multimodal large language models on comprehensive long video understanding. arXiv preprint arXiv:2409.18938 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754765","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:03:50Z","timestamp":1765343030000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754765"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3754765","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754765","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}