{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T14:23:35Z","timestamp":1779373415375,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T00:00:00Z","timestamp":1754179200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Surface Transportation Safety Center for Excellence","award":["238717"],"award-info":[{"award-number":["238717"]}]},{"name":"National Science Foundation","award":["IIS-2339989, 2406439"],"award-info":[{"award-number":["IIS-2339989, 2406439"]}]},{"name":"DARPA","award":["HR00112490370 and HR001124S0013"],"award-info":[{"award-number":["HR00112490370 and HR001124S0013"]}]},{"name":"Department of Homeland Security","award":["17STCIN00001-08-00"],"award-info":[{"award-number":["17STCIN00001-08-00"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737396","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:03:27Z","timestamp":1754255007000},"page":"5972-5983","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Are Vision LLMs Road-Ready? A Comprehensive Benchmark for Safety-Critical Driving Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5533-8506","authenticated-orcid":false,"given":"Tong","family":"Zeng","sequence":"first","affiliation":[{"name":"Computer Science, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7422-4398","authenticated-orcid":false,"given":"Longfeng","family":"Wu","sequence":"additional","affiliation":[{"name":"Computer Science, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9308-333X","authenticated-orcid":false,"given":"Liang","family":"Shi","sequence":"additional","affiliation":[{"name":"Statistics, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA and Virginia Tech Transportation Institute, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7065-2990","authenticated-orcid":false,"given":"Dawei","family":"Zhou","sequence":"additional","affiliation":[{"name":"Computer Science, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2572-481X","authenticated-orcid":false,"given":"Feng","family":"Guo","sequence":"additional","affiliation":[{"name":"Statistics, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA and Virginia Tech Transportation Institute, Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736.","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_2_1_2_1","unstructured":"G Audi AG Volkswagen et al. [n.d.]. The PEGASUS Method."},{"key":"e_1_3_2_1_3_1","unstructured":"Xinyun Chen Ryan A Chi Xuezhi Wang and Denny Zhou. 2024. Premise order matters in reasoning with large language models. arXiv preprint arXiv:2402.08939(2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Luc Van Gool, and Marie-Francine Moens","author":"Deruyttere Thierry","year":"2019","unstructured":"Thierry Deruyttere, Simon Vandenhende, Dusan Grujicic, Luc Van Gool, and Marie-Francine Moens. 2019. Talk2car: Taking control of your self-driving car. arXiv preprint arXiv:1909.10838(2019)."},{"key":"e_1_3_2_1_5_1","unstructured":"Chaoyou Fu Yuhan Dai Yondong Luo Lei Li Shuhuai Ren Renrui Zhang Zihan Wang Chenyu Zhou Yunhang Shen Mengdan Zhang et al. 2024. Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis. arXiv preprint arXiv:2405.21075(2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"James Haworth, Jonathan Cardoso-Silva, and Ming Li.","author":"Gao Xiaowei","year":"2023","unstructured":"Xiaowei Gao, Pengxiang Li, xinke Jiang, James Haworth, Jonathan Cardoso-Silva, and Ming Li. 2023. DriveScenify: Boosting Driving Scene Understanding with Advanced Vision-Language Models. https:\/\/github.com\/pixeli99\/DSify"},{"key":"e_1_3_2_1_7_1","volume-title":"Drivemllm: A benchmark for spatial understanding with multimodal large language models in autonomous driving. arXiv preprint arXiv:2411.13112(2024).","author":"Guo Xianda","year":"2024","unstructured":"Xianda Guo, Ruijun Zhang, Yiqun Duan, Yuhang He, Chenming Zhang, Shuai Liu, and Long Chen. 2024. Drivemllm: A benchmark for spatial understanding with multimodal large language models in autonomous driving. arXiv preprint arXiv:2411.13112(2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"Dme-driver: Integrating human decision logic and 3d scene perception in autonomous driving. arXiv preprint arXiv:2401.03641(2024).","author":"Han Wencheng","year":"2024","unstructured":"Wencheng Han, Dongqian Guo, Cheng-Zhong Xu, and Jianbing Shen. 2024. Dme-driver: Integrating human decision logic and 3d scene perception in autonomous driving. arXiv preprint arXiv:2401.03641(2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Jonathan M Hankey Miguel A Perez and Julie A McClafferty. 2016. Description of the SHRP 2 naturalistic database and the crash near-crash and baseline data sets. Technical Report. Virginia Tech Transportation Institute."},{"key":"e_1_3_2_1_10_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. GPT-4o System Card. arXiv preprint arXiv:2410.21276(2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Semantic Understanding of Traffic Scenes with Large Vision Language Models. In 2024 IEEE Intelligent Vehicles Symposium (IV). IEEE.","author":"Jain Sandesh","year":"2024","unstructured":"Sandesh Jain, Surendrabikram Thapa, Kuan-Ting Chen, A Lynn Abbott, and Abhijit Sarkar. 2024. Semantic Understanding of Traffic Scenes with Large Vision Language Models. In 2024 IEEE Intelligent Vehicles Symposium (IV). IEEE."},{"key":"e_1_3_2_1_12_1","unstructured":"Jean Kaddour Joshua Harris Maximilian Mozes Herbie Bradley Roberta Raileanu and Robert McHardy. 2023. Challenges and applications of large language models. arXiv preprint arXiv:2307.10169(2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_35"},{"key":"e_1_3_2_1_14_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326(2024).","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024b. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326(2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"e_1_3_2_1_17_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122(2023).","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122(2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00638"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"e_1_3_2_1_21_1","unstructured":"Yuhang Lu Yichen Yao Jiadong Tu Jiangnan Shao Yuexin Ma and Xinge Zhu. 2024. Can LVLMs Obtain a Driver's License? A Benchmark Towards Reliable AGI for Autonomous Driving. arXiv preprint arXiv:2409.02914(2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424(2023).","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424(2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"e_1_3_2_1_24_1","volume-title":"Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415(2023).","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Yuxi Qian, Junjie Ye, Hang Zhao, and Yue Wang. 2023. Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415(2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72980-5_15"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_17"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW65960.2025.00119"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00707"},{"key":"e_1_3_2_1_31_1","volume-title":"European Conference on Computer Vision. Springer, 256-274","author":"Sima Chonghao","year":"2024","unstructured":"Chonghao Sima, Katrin Renz, Kashyap Chitta, Li Chen, Hanxue Zhang, Chengen Xie, Jens Bei\u00dfwenger, Ping Luo, Andreas Geiger, and Hongyang Li. 2024. Drivelm: Driving with graph visual question answering. In European Conference on Computer Vision. Springer, 256-274."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"e_1_3_2_1_33_1","volume-title":"Neurips 2024 Workshop Foundation Models for Science: Progress, Opportunities, and Challenges.","author":"Taechoyotin Pawin","unstructured":"Pawin Taechoyotin, Guanchao Wang, Tong Zeng, Bradley Sides, and Daniel Acuna. [n.d.]. MAMORX: Multi-agent multi-modal scientific review generation with external knowledge. In Neurips 2024 Workshop Foundation Models for Science: Progress, Opportunities, and Challenges."},{"key":"e_1_3_2_1_34_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805(2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Booz Allen Hamilton, et al","author":"Thorn Eric","year":"2018","unstructured":"Eric Thorn, Shawn C Kimmel, Michelle Chaka, Booz Allen Hamilton, et al., 2018. A framework for automated driving system testable cases and scenarios. Technical Report. United States. Department of Transportation. National Highway Traffic Safety \u2026"},{"key":"e_1_3_2_1_36_1","volume-title":"Drivevlm: The convergence of autonomous driving and large vision-language models. arXiv preprint arXiv:2402.12289(2024).","author":"Tian Xiaoyu","year":"2024","unstructured":"Xiaoyu Tian, Junru Gu, Bailin Li, Yicheng Liu, Yang Wang, Zhiyong Zhao, Kun Zhan, Peng Jia, Xianpeng Lang, and Hang Zhao. 2024. Drivevlm: The convergence of autonomous driving and large vision-language models. arXiv preprint arXiv:2402.12289(2024)."},{"key":"e_1_3_2_1_37_1","first-page":"73098","article-title":"Towards heterogeneous long-tailed learning: Benchmarking, metrics, and toolbox","volume":"37","author":"Wang Haohui","year":"2024","unstructured":"Haohui Wang, Weijie Guan, Chen Jianpeng, Zi Wang, and Dawei Zhou. 2024b. Towards heterogeneous long-tailed learning: Benchmarking, metrics, and toolbox. Advances in Neural Information Processing Systems, Vol. 37 (2024), 73098-73123.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671880"},{"key":"e_1_3_2_1_39_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191(2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Towards Trustworthy Graph Neural Networks and Their Applications in Recommender Systems. In 2024 IEEE International Conference on Big Data (BigData). IEEE, 8250-8252","author":"Wu Longfeng","year":"2024","unstructured":"Longfeng Wu. 2024. Towards Trustworthy Graph Neural Networks and Their Applications in Recommender Systems. In 2024 IEEE International Conference on Big Data (BigData). IEEE, 8250-8252."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599525"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714909"},{"key":"e_1_3_2_1_43_1","volume-title":"Ziwei Liu, and Liang Pan.","author":"Xie Shaoyuan","year":"2025","unstructured":"Shaoyuan Xie, Lingdong Kong, Yuhao Dong, Chonghao Sima, Wenwei Zhang, Qi Alfred Chen, Ziwei Liu, and Liang Pan. 2025. Are VLMs Ready for Autonomous Driving? An Empirical Study from the Reliability, Data, and Metric Perspectives. arXiv preprint arXiv:2501.04003(2025)."},{"key":"e_1_3_2_1_44_1","volume-title":"See Kiong Ng, and Jiashi Feng","author":"Xu Lin","year":"2024","unstructured":"Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, and Jiashi Feng. 2024b. Pllava: Parameter-free llava extension from images to videos for video dense captioning. arXiv preprint arXiv:2404.16994(2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00954"},{"key":"e_1_3_2_1_46_1","volume-title":"Drivegpt4: Interpretable end-to-end autonomous driving via large language model","author":"Xu Zhenhua","year":"2024","unstructured":"Zhenhua Xu, Yujia Zhang, Enze Xie, Zhen Zhao, Yong Guo, Kwan-Yee K Wong, Zhenguo Li, and Hengshuang Zhao. 2024a. Drivegpt4: Interpretable end-to-end autonomous driving via large language model. IEEE Robotics and Automation Letters(2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800(2024).","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al., 2024. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800(2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"Mang Ye Xuankun Rong Wenke Huang Bo Du Nenghai Yu and Dacheng Tao. 2025. A survey of safety on large vision-language models: Attacks defenses and evaluations. arXiv preprint arXiv:2502.14881(2025)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_50_1","volume-title":"Yong jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li.","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Bo Li, haotian Liu, Yong jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li. 2024. LLaVA-NeXT: A Strong Zero-shot Video Understanding Model."},{"key":"e_1_3_2_1_51_1","first-page":"4098","article-title":"Muvir: Multi-view rare category detection","author":"Zhou Dawei","year":"2015","unstructured":"Dawei Zhou, Jingrui He, K Sel\u00e7uk Candan, and Hasan Davulcu. 2015. Muvir: Multi-view rare category detection. In IJCAI. Citeseer, 4098-4104.","journal-title":"IJCAI. Citeseer"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2016.0083"},{"key":"e_1_3_2_1_53_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592(2023)."},{"key":"e_1_3_2_1_54_1","first-page":"50117","article-title":"Toolqa: A dataset for llm question answering with external tools","volume":"36","author":"Zhuang Yuchen","year":"2023","unstructured":"Yuchen Zhuang, Yue Yu, Kuan Wang, Haotian Sun, and Chao Zhang. 2023. Toolqa: A dataset for llm question answering with external tools. Advances in Neural Information Processing Systems, Vol. 36 (2023), 50117-50143. endthebibl","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737396","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737396","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:01:46Z","timestamp":1777572106000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737396"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":54,"alternative-id":["10.1145\/3711896.3737396","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737396","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}