{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:46Z","timestamp":1765339786934,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3759207","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"14346-14347","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Video Question Answering and Beyond"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5659-793X","authenticated-orcid":false,"given":"Yicong","family":"Li","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5573-6195","authenticated-orcid":false,"given":"Junbin","family":"Xiao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7418-6141","authenticated-orcid":false,"given":"Angela","family":"Yao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Daichi Azuma Taiki Miyanishi Shuhei Kurita and Motoaki Kawanabe. 2022. ScanQA: 3D Question Answering for Spatial Scene Understanding. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv (2025)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Ankan Bansal Yuting Zhang and Rama Chellappa. 2020. Visual Question Answering on Image Sets. In ECCV.","DOI":"10.1007\/978-3-030-58589-1_4"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Leonard B''armann and AlexWaibel. 2022. Where Did I Leave My Keys? - Episodic-Memory-Based Question Answering on Egocentric Videos. In CVPRW.","DOI":"10.1109\/CVPRW56347.2022.00162"},{"key":"e_1_3_2_1_5_1","volume-title":"Understanding long videos via llm-powered entity relation graphs. ACM MM","author":"Chu Meng","year":"2025","unstructured":"Meng Chu, Yicong Li, and Tat-Seng Chua. 2025. Understanding long videos via llm-powered entity relation graphs. ACM MM (2025)."},{"key":"e_1_3_2_1_6_1","unstructured":"Abhishek Das Satwik Kottur Paul Guerrero Vignesh Ramanathan Stefan Lee Dhruv Batra and Devi Parikh. 2018. Embodied Question Answering. In CVPR."},{"key":"e_1_3_2_1_7_1","volume-title":"Streaming video question-answering with in-context video kv-cache retrieval. arXiv","author":"Di Shangzhe","year":"2025","unstructured":"Shangzhe Di, Zhelun Yu, Guanghao Zhang, Haoyuan Li, Tao Zhong, Hao Cheng, Bolin Li,Wanggui He, Fangxun Shu, and Hao Jiang. 2025. Streaming video question-answering with in-context video kv-cache retrieval. arXiv (2025)."},{"key":"e_1_3_2_1_8_1","volume-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. In CVPR.","author":"Fu Chaoyou","year":"2025","unstructured":"Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al. 2025. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. In CVPR."},{"key":"e_1_3_2_1_9_1","unstructured":"Yicong Li XiangWang Junbin Xiao and Tat-Seng Chua. 2022. Equivariant and invariant grounding for video question answering. In ACM MM."},{"key":"e_1_3_2_1_10_1","unstructured":"Yicong Li Xiang Wang Junbin Xiao Wei Ji and Tat-Seng Chua. 2022. Invariant grounding for video question answering. In CVPR."},{"key":"e_1_3_2_1_11_1","volume-title":"Transformer-empowered invariant grounding for video question answering. TPAMI","author":"Li Yicong","year":"2023","unstructured":"Yicong Li, Xiang Wang, Junbin Xiao, Wei Ji, and Tat-Seng Chua. 2023. Transformer-empowered invariant grounding for video question answering. TPAMI (2023)."},{"key":"e_1_3_2_1_12_1","unstructured":"Yicong Li Junbin Xiao Chun Feng Xiang Wang and Tat-Seng Chua. 2023. Discovering spatio-temporal rationales for video question answering. In ICCV."},{"key":"e_1_3_2_1_13_1","unstructured":"Yicong Li Xun Yang An Zhang Chun Feng Xiang Wang and Tat-Seng Chua. 2023. Redundancy-aware transformer for video question answering. In ACM MM."},{"key":"e_1_3_2_1_14_1","unstructured":"Xiaojian Ma Silong Yong Zilong Zheng Qing Li Yitao Liang Song-Chun Zhu and Siyuan Huang. 2023. SQA3D: Situated Question Answering in 3D Scenes. In ICLR."},{"key":"e_1_3_2_1_15_1","volume-title":"Egoschema: A diagnostic benchmark for very long-form video language understanding. NeurIPS 36","author":"Mangalam Karttikeya","year":"2023","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2023. Egoschema: A diagnostic benchmark for very long-form video language understanding. NeurIPS 36 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Advancing 3D Scene Understanding with MV-ScanQA: Multi-View Reasoning Evaluation and TripAlign Pre-training Dataset. ACM MM","author":"Mo Wentao","year":"2025","unstructured":"Wentao Mo, Qingchao Chen, Yuxin Peng, Siyuan Huang, and Yang Liu. 2025. Advancing 3D Scene Understanding with MV-ScanQA: Multi-View Reasoning Evaluation and TripAlign Pre-training Dataset. ACM MM (2025)."},{"key":"e_1_3_2_1_17_1","volume-title":"Advancing Egocentric Video Question Answering with Multimodal Large Language Models. arXiv preprint","author":"Patel Alkesh","year":"2025","unstructured":"Alkesh Patel, Vibhav Chitalia, and Yinfei Yang. 2025. Advancing Egocentric Video Question Answering with Multimodal Large Language Models. arXiv preprint (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"Longvideobench: A benchmark for long-context interleaved video-language understanding. NeurIPS 37","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Dongxu Li, Bei Chen, and Junnan Li. 2024. Longvideobench: A benchmark for long-context interleaved video-language understanding. NeurIPS 37 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Junbin Xiao Nanxin Huang Hangyu Qin Dongyang Li Yicong Li Fengbin Zhu Zhulin Tao Jianxing Yu Liang Lin Tat-Seng Chua et al. 2025. Videoqa in the era of llms: An empirical study. IJCV (2025).","DOI":"10.1007\/s11263-025-02385-8"},{"key":"e_1_3_2_1_20_1","volume-title":"EgoBlind: Towards Egocentric Visual Assistance for the Blind People. arXiv","author":"Xiao Junbin","year":"2025","unstructured":"Junbin Xiao, Nanxin Huang, Hao Qiu, Zhulin Tao, Xun Yang, Richang Hong, Meng Wang, and Angela Yao. 2025. EgoBlind: Towards Egocentric Visual Assistance for the Blind People. arXiv (2025)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Junbin Xiao Angela Yao Yicong Li and Tat-Seng Chua. 2024. Can I trust your answer? visually grounded video question answering. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01254"},{"key":"e_1_3_2_1_22_1","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen JialinWang Yang Fan Kai Dang Bin Zhang XiongWang Yunfei Chu and Junyang Lin. 2025. Qwen2.5-Omni Technical Report. arXiv:2503.20215"},{"key":"e_1_3_2_1_23_1","unstructured":"Andy Zeng Maria Attarian Krzysztof Marcin Choromanski AdrianWong Stefan Welker Federico Tombari Aveek Purohit Michael S Ryoo Vikas Sindhwani Johnny Lee et al. [n.d.]. Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language. In ICLR."},{"key":"e_1_3_2_1_24_1","volume-title":"Towards Omnidirectional Reasoning with 360-R1: A Dataset, Benchmark, and GRPO-Based Method. arXiv preprint","author":"Zhang Xinshen","year":"2025","unstructured":"Xinshen Zhang, Zhen Ye, and Xu Zheng. 2025. Towards Omnidirectional Reasoning with 360-R1: A Dataset, Benchmark, and GRPO-Based Method. arXiv preprint (2025)."},{"key":"e_1_3_2_1_25_1","volume-title":"Video question answering: Datasets, algorithms and challenges. arXiv","author":"Zhong Yaoyao","year":"2022","unstructured":"Yaoyao Zhong, Junbin Xiao,Wei Ji, Yicong Li,Weihong Deng, and Tat-Seng Chua. 2022. Video question answering: Datasets, algorithms and challenges. arXiv (2022)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3759207","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:25Z","timestamp":1765339585000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3759207"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":25,"alternative-id":["10.1145\/3746027.3759207","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3759207","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}