{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:06Z","timestamp":1765339626406,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["62306239"],"award-info":[{"award-number":["62306239"]}]},{"name":"National Key Lab of Unmanned Aerial Vehicle Technology","award":["WR202413"],"award-info":[{"award-number":["WR202413"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754885","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"2909-2918","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FineQuest: Adaptive Knowledge-Assisted Sports Video Understanding via Agent-of-Thoughts Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1666-2037","authenticated-orcid":false,"given":"Haodong","family":"Chen","sequence":"first","affiliation":[{"name":"School of Automation, Northwestern Polytechnical University, Xi'an City, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0661-712X","authenticated-orcid":false,"given":"Haojian","family":"Huang","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3021-6439","authenticated-orcid":false,"given":"Xinxiang","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Software, Northwestern Polytechnical University, Xi'an City, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0862-9941","authenticated-orcid":false,"given":"Dian","family":"Shao","sequence":"additional","affiliation":[{"name":"Unmanned System Research Institute, Northwestern Polytechnical University, Xi'an City, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"VisualSem: a high-quality knowledge graph for vision and language. arXiv preprint arXiv:2008.09150","author":"Alberts Houda","year":"2020","unstructured":"Houda Alberts, Teresa Huang, Yash Deshpande, Yibo Liu, Kyunghyun Cho, Clara Vania, and Iacer Calixto. 2020. VisualSem: a high-quality knowledge graph for vision and language. arXiv preprint arXiv:2008.09150 (2020)."},{"key":"e_1_3_2_1_2_1","volume-title":"Agla: Mitigating object hallucinations in large vision-language models with assembly of global and local attention. arXiv preprint arXiv:2406.12718","author":"An Wenbin","year":"2024","unstructured":"Wenbin An, Feng Tian, Sicong Leng, Jiahao Nie, Haonan Lin, QianYing Wang, Guang Dai, Ping Chen, and Shijian Lu. 2024. Agla: Mitigating object hallucinations in large vision-language models with assembly of global and local attention. arXiv preprint arXiv:2406.12718 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"InfiniBench: A Comprehensive Benchmark for Large Multimodal Models in Very Long Video Understanding. arXiv preprint arXiv:2406.19875","author":"Ataallah Kirolos","year":"2024","unstructured":"Kirolos Ataallah, Chenhui Gou, Eslam Abdelrahman, Khushbu Pahwa, Jian Ding, and Mohamed Elhoseiny. 2024. InfiniBench: A Comprehensive Benchmark for Large Multimodal Models in Very Long Video Understanding. 
arXiv preprint arXiv:2406.19875 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930","author":"Bai Zechen","year":"2024","unstructured":"Zechen Bai, Pichao Wang, Tianjun Xiao, Tong He, Zongbo Han, Zheng Zhang, and Mike Zheng Shou. 2024. Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICTAI56018.2022.00106"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_7_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_1_8_1","volume-title":"MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video Understanding. arXiv preprint arXiv:2406.14515","author":"Fang Xinyu","year":"2024","unstructured":"Xinyu Fang, Kangrui Mao, Haodong Duan, Xiangyu Zhao, Yining Li, Dahua Lin, and Kai Chen. 2024. MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video Understanding. arXiv preprint arXiv:2406.14515 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Video-of-thought: Step-by-step video reasoning from perception to cognition. arXiv preprint arXiv:2501.03230","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, and Wynne Hsu. 2024. Video-of-thought: Step-by-step video reasoning from perception to cognition. arXiv preprint arXiv:2501.03230 (2024)."},{"key":"e_1_3_2_1_10_1","first-page":"84","volume-title":"Vienna","author":"Ferrada Sebasti\u00e1n","year":"2017","unstructured":"Sebasti\u00e1n Ferrada, Benjamin Bustos, and Aidan Hogan. 2017. IMGpedia: a linked dataset with content-based analysis of Wikimedia images. In The Semantic Web-ISWC 2017: 16th International Semantic Web Conference, Vienna, Austria, October 21-25, 2017, Proceedings, Part II 16. Springer, 84-93."},{"key":"e_1_3_2_1_11_1","unstructured":"Chaoyou Fu Yuhan Dai Yondong Luo Lei Li Shuhuai Ren Renrui Zhang Zihan Wang Chenyu Zhou Yunhang Shen Mengdan Zhang et al. 2024. Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis. arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00223"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01358"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.2307\/2346830"},{"key":"e_1_3_2_1_16_1","volume-title":"VistaDPO: Video Hierarchical Spatial-Temporal Direct Preference Optimization for Large Video Models. arXiv preprint arXiv:2504.13122","author":"Huang Haojian","year":"2025","unstructured":"Haojian Huang, Haodong Chen, Shengqiong Wu, Meng Luo, Jinlan Fu, Xinya Du, Hanwang Zhang, and Hao Fei. 2025a. VistaDPO: Video Hierarchical Spatial-Temporal Direct Preference Optimization for Large Video Models. 
arXiv preprint arXiv:2504.13122 (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32400"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28017"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422844.3423051"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_22_1","volume-title":"Language repository for long video understanding. arXiv preprint arXiv:2403.14622","author":"Kahatapitiya Kumara","year":"2024","unstructured":"Kumara Kahatapitiya, Kanchana Ranasinghe, Jongwoo Park, and Michael S Ryoo. 2024. Language repository for long video understanding. arXiv preprint arXiv:2403.14622 (2024)."},{"key":"e_1_3_2_1_23_1","unstructured":"Daniel Kahneman. 2011. Thinking fast and slow. macmillan."},{"key":"e_1_3_2_1_24_1","volume-title":"Multimodal Reasoning with Multimodal Knowledge Graph. arXiv preprint arXiv:2406.02030","author":"Lee Junlin","year":"2024","unstructured":"Junlin Lee, Yequan Wang, Jing Li, and Min Zhang. 2024. Multimodal Reasoning with Multimodal Knowledge Graph. arXiv preprint arXiv:2406.02030 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"e_1_3_2_1_26_1","volume-title":"Sports-qa: A large-scale video question answering benchmark for complex and professional sports. arXiv preprint arXiv:2401.01505","author":"Li Haopeng","year":"2024","unstructured":"Haopeng Li, Andong Deng, Qiuhong Ke, Jun Liu, Hossein Rahmani, Yulan Guo, Bernt Schiele, and Chen Chen. 2024a. Sports-qa: A large-scale video question answering benchmark for complex and professional sports. arXiv preprint arXiv:2401.01505 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_2_1_28_1","volume-title":"Contrastive decoding: Open-ended text generation as optimization. arXiv preprint arXiv:2210.15097","author":"Li Xiang Lisa","year":"2022","unstructured":"Xiang Lisa Li, Ari Holtzman, Daniel Fried, Percy Liang, Jason Eisner, Tatsunori Hashimoto, Luke Zettlemoyer, and Mike Lewis. 2022a. Contrastive decoding: Open-ended text generation as optimization. arXiv preprint arXiv:2210.15097 (2022)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01328"},{"key":"e_1_3_2_1_30_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023a. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Llama-vid: An image is worth 2 tokens in large language models. arXiv preprint arXiv:2311.17043","author":"Li Yanwei","year":"2023","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2023b. Llama-vid: An image is worth 2 tokens in large language models. arXiv preprint arXiv:2311.17043 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.171"},{"key":"e_1_3_2_1_33_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023b. 
Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Mm-vid: Advancing video understanding with gpt-4v (ision). arXiv preprint arXiv:2310.19773","author":"Lin Kevin","year":"2023","unstructured":"Kevin Lin, Faisal Ahmed, Linjie Li, Chung-Ching Lin, Ehsan Azarnasab, Zhengyuan Yang, Jianfeng Wang, Lin Liang, Zicheng Liu, Yumao Lu, et al., 2023a. Mm-vid: Advancing video understanding with gpt-4v (ision). arXiv preprint arXiv:2310.19773 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge.","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, and Yong Jae Lee. 2024. Llava-next: Improved reasoning, ocr, and world knowledge."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-21348-0_30"},{"key":"e_1_3_2_1_37_1","volume-title":"Vista-llama: Reliable video narrator via equal distance to visual tokens. arXiv preprint arXiv:2312.08870","author":"Ma Fan","year":"2023","unstructured":"Fan Ma, Xiaojie Jin, Heng Wang, Yuchen Xian, Jiashi Feng, and Yi Yang. 2023. Vista-llama: Reliable video narrator via equal distance to visual tokens. arXiv preprint arXiv:2312.08870 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Mangalam Karttikeya","year":"2024","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2024. Egoschema: A diagnostic benchmark for very long-form video language understanding. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_40_1","unstructured":"OpenAI. 2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/. Accessed: 2024-07-27."},{"key":"e_1_3_2_1_41_1","volume-title":"Too Many Frames, not all Useful: Efficient Strategies for Long-Form Video QA. arXiv preprint arXiv:2406.09396","author":"Park Jongwoo","year":"2024","unstructured":"Jongwoo Park, Kanchana Ranasinghe, Kumara Kahatapitiya, Wonjeong Ryoo, Donghyun Kim, and Michael S Ryoo. 2024. Too Many Frames, not all Useful: Efficient Strategies for Long-Form Video QA. arXiv preprint arXiv:2406.09396 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_43_1","volume-title":"Towards universal Soccer video understanding. arXiv preprint arXiv:2412.01820","author":"Rao Jiayuan","year":"2024","unstructured":"Jiayuan Rao, Haoning Wu, Hao Jiang, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2024. Towards universal Soccer video understanding. 
arXiv preprint arXiv:2412.01820 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Doll\u00e1r, and Christoph Feichtenhofer.","author":"Ravi Nikhila","year":"2024","unstructured":"Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman R\u00e4dle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Doll\u00e1r, and Christoph Feichtenhofer. 2024. SAM 2: Segment Anything in Images and Videos. arXiv preprint (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Layered Chain-of-Thought Prompting for Multi-Agent LLM Systems: A Comprehensive Approach to Explainable Large Language Models. arXiv preprint arXiv:2501.18645","author":"Sanwal Manish","year":"2025","unstructured":"Manish Sanwal. 2025. Layered Chain-of-Thought Prompting for Multi-Agent LLM Systems: A Comprehensive Approach to Explainable Large Language Models. arXiv preprint arXiv:2501.18645 (2025)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00401"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"e_1_3_2_1_48_1","volume-title":"Unlocking Video-LLM via Agent-of-Thoughts Distillation. arXiv preprint arXiv:2412.01694","author":"Shi Yudi","year":"2024","unstructured":"Yudi Shi, Shangzhe Di, Qirui Chen, and Weidi Xie. 2024. Unlocking Video-LLM via Agent-of-Thoughts Distillation. arXiv preprint arXiv:2412.01694 (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_1_50_1","volume-title":"Yongfeng Zhang, and Dongfang Liu.","author":"Sun Guangyan","year":"2024","unstructured":"Guangyan Sun, Mingyu Jin, Zhenting Wang, Cheng-Long Wang, Siqi Ma, Qifan Wang, Tong Geng, Ying Nian Wu, Yongfeng Zhang, and Dongfang Liu. 2024. Visual agents as fast and slow thinkers. arXiv preprint arXiv:2408.08862 (2024)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2946378"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00392"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00456"},{"key":"e_1_3_2_1_55_1","volume-title":"Tarsier: Recipes for training and evaluating large video description models. arXiv preprint arXiv:2407.00634","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Liping Yuan, Yuchen Zhang, and Haomiao Sun. 2024b. Tarsier: Recipes for training and evaluating large video description models. arXiv preprint arXiv:2407.00634 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Temporal segment networks for action recognition in videos","author":"Wang Limin","year":"2018","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2018. Temporal segment networks for action recognition in videos. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 11 (2018), 2740-2755."},{"key":"e_1_3_2_1_57_1","volume-title":"Nakul Agarwal, Kwonjoon Lee, and Chen Sun.","author":"Wang Shijie","year":"2023","unstructured":"Shijie Wang, Qi Zhao, Minh Quan Do, Nakul Agarwal, Kwonjoon Lee, and Chen Sun. 2023. Vamos: Versatile action models for video understanding. 
arXiv preprint arXiv:2311.13627 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"Videoagent: Long-form video understanding with large language model as agent. arXiv preprint arXiv:2403.10517","author":"Wang Xiaohan","year":"2024","unstructured":"Xiaohan Wang, Yuhui Zhang, Orr Zohar, and Serena Yeung-Levy. 2024c. Videoagent: Long-form video understanding with large language model as agent. arXiv preprint arXiv:2403.10517 (2024)."},{"key":"e_1_3_2_1_59_1","first-page":"8483","article-title":"Language models with image descriptors are strong few-shot video-language learners","volume":"35","author":"Wang Zhenhailong","year":"2022","unstructured":"Zhenhailong Wang, Manling Li, Ruochen Xu, Luowei Zhou, Jie Lei, Xudong Lin, Shuohang Wang, Ziyi Yang, Chenguang Zhu, Derek Hoiem, et al., 2022. Language models with image descriptors are strong few-shot video-language learners. Advances in Neural Information Processing Systems, Vol. 35 (2022), 8483-8497.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_60_1","volume-title":"VideoTree: Adaptive Tree-based Video Representation for LLM Reasoning on Long Videos. arXiv preprint arXiv:2405.19209","author":"Wang Ziyang","year":"2024","unstructured":"Ziyang Wang, Shoubin Yu, Elias Stengel-Eskin, Jaehong Yoon, Feng Cheng, Gedas Bertasius, and Mohit Bansal. 2024a. VideoTree: Adaptive Tree-based Video Representation for LLM Reasoning on Long Videos. arXiv preprint arXiv:2405.19209 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Sportu: A comprehensive sports understanding benchmark for multimodal large language models. arXiv preprint arXiv:2410.08474","author":"Xia Haotian","year":"2024","unstructured":"Haotian Xia, Zhengbang Yang, Junbo Zou, Rhys Tracy, Yuqing Wang, Chi Lu, Christopher Lai, Yanjun He, Xun Shao, Zhuoqing Xie, et al., 2024. Sportu: A comprehensive sports understanding benchmark for multimodal large language models. arXiv preprint arXiv:2410.08474 (2024)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01804"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00296"},{"key":"e_1_3_2_1_66_1","volume-title":"Sports Intelligence: Assessing the Sports Understanding Capabilities of Language Models through Question Answering from Text to Video. arXiv preprint arXiv:2406.14877","author":"Yang Zhengbang","year":"2024","unstructured":"Zhengbang Yang, Haotian Xia, Jingxi Li, Zezhi Chen, Zhuangdi Zhu, and Weining Shen. 2024. Sports Intelligence: Assessing the Sports Understanding Capabilities of Language Models through Question Answering from Text to Video. arXiv preprint arXiv:2406.14877 (2024)."},{"key":"e_1_3_2_1_67_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Yu Shoubin","year":"2024","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2024. Self-chained image-language model for video localization and question answering. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"e_1_3_2_1_70_1","volume-title":"Long-CLIP: Unlocking the Long-Text Capability of CLIP. 
arXiv preprint arXiv:2403.15378","author":"Zhang Beichen","year":"2024","unstructured":"Beichen Zhang, Pan Zhang, Xiaoyi Dong, Yuhang Zang, and Jiaqi Wang. 2024c. Long-CLIP: Unlocking the Long-Text Capability of CLIP. arXiv preprint arXiv:2403.15378 (2024)."},{"key":"e_1_3_2_1_71_1","volume-title":"Ziyang Wang, Shoubin Yu, Mohit Bansal, and Gedas Bertasius.","author":"Zhang Ce","year":"2023","unstructured":"Ce Zhang, Taixi Lu, Md Mohaiminul Islam, Ziyang Wang, Shoubin Yu, Mohit Bansal, and Gedas Bertasius. 2023. A simple llm framework for long-range video question-answering. arXiv preprint arXiv:2312.17235 (2023)."},{"key":"e_1_3_2_1_72_1","volume-title":"Flash-VStream: Memory-Based Real-Time Understanding for Long Video Streams. arXiv preprint arXiv:2406.08085","author":"Zhang Haoji","year":"2024","unstructured":"Haoji Zhang, Yiqin Wang, Yansong Tang, Yong Liu, Jiashi Feng, Jifeng Dai, and Xiaojie Jin. 2024b. Flash-VStream: Memory-Based Real-Time Understanding for Long Video Streams. arXiv preprint arXiv:2406.08085 (2024)."},{"key":"e_1_3_2_1_73_1","volume-title":"Yong jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li.","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Bo Li, haotian Liu, Yong jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li. 2024a. LLaVA-NeXT: A Strong Zero-shot Video Understanding Model. https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.317"},{"key":"e_1_3_2_1_75_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852","author":"Zhu Bin","year":"2023","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al., 2023. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754885","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:03:47Z","timestamp":1765339427000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754885"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":75,"alternative-id":["10.1145\/3746027.3754885","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754885","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
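
The record above is the JSON body returned by the public Crossref REST API for a single work. As a minimal sketch of how such a record can be fetched and inspected programmatically: the snippet below retrieves this paper by its DOI using only the Python standard library. The endpoint shape (https://api.crossref.org/works/{doi}) and the top-level status/message-type/message layout match the record shown here; the client name and mailto address in the User-Agent header are placeholders you would replace with your own (Crossref's "polite pool" convention), not values taken from this record.

import json
import urllib.request

# Fetch a Crossref work record by DOI; the response has the same shape
# as the JSON document above: {"status": ..., "message-type": "work",
# "message-version": ..., "message": {...}}.
DOI = "10.1145/3746027.3754885"
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    # Placeholder identity for Crossref's polite pool; substitute your own.
    headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"},
)
with urllib.request.urlopen(req) as resp:
    record = json.load(resp)

assert record["status"] == "ok" and record["message-type"] == "work"
msg = record["message"]

print(msg["title"][0])            # paper title
print(msg["container-title"][0])  # proceedings name
print(", ".join(f"{a['given']} {a['family']}" for a in msg["author"]))

# The reference list mixes structured entries with bare DOI assertions;
# count how many of the deposited references carry a resolvable DOI.
refs = msg.get("reference", [])
print(len([r for r in refs if "DOI" in r]), "of", msg["references-count"], "references have DOIs")

Note that "message" holds everything of interest; fields such as "reference", "funder", and "author" are lists of small objects, so ordinary list comprehensions (as in the DOI count above) are usually all the post-processing such records need.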