{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:44:01Z","timestamp":1772905441782,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["No. 62332016"],"award-info":[{"award-number":["No. 62332016"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755730","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"5040-5049","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["VLMPlanner: Integrating Visual Language Models with Motion Planning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2339-8863","authenticated-orcid":false,"given":"Zhipeng","family":"Tang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1814-6783","authenticated-orcid":false,"given":"Sha","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9624-7451","authenticated-orcid":false,"given":"Jiajun","family":"Deng","sequence":"additional","affiliation":[{"name":"University of Adelaide, Adelaide, SA, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9207-2076","authenticated-orcid":false,"given":"Chenjie","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0964-7279","authenticated-orcid":false,"given":"Guoliang","family":"You","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5533-3972","authenticated-orcid":false,"given":"Yuting","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6125-0341","authenticated-orcid":false,"given":"Xinrui","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6520-255X","authenticated-orcid":false,"given":"Yanyong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2019. Motional: nuplan challange. https:\/\/github.com\/motional\/nuplan-devkit"},{"key":"e_1_3_2_1_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Whye Kit Fong, Eric Wolff, Alex Lang, Luke Fletcher, Oscar Beijbom, and Sammy Omari.","author":"Caesar Holger","year":"2021","unstructured":"Holger Caesar, Juraj Kabzan, Kok Seang Tan, Whye Kit Fong, Eric Wolff, Alex Lang, Luke Fletcher, Oscar Beijbom, and Sammy Omari. 2021. nuplan: A closedloop ml-based planning benchmark for autonomous vehicles. arXiv preprint arXiv:2106.11810 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611018"},{"key":"e_1_3_2_1_6_1","volume-title":"Towards end-to-end embodied decision making via multi-modal large language model: Explorations with gpt4- vision and beyond. arXiv preprint arXiv:2310.02071","author":"Chen Liang","year":"2023","unstructured":"Liang Chen, Yichi Zhang, Shuhuai Ren, Haozhe Zhao, Zefan Cai, Yuchi Wang, PeiyiWang, Tianyu Liu, and Baobao Chang. 2023. Towards end-to-end embodied decision making via multi-modal large language model: Explorations with gpt4- vision and beyond. arXiv preprint arXiv:2310.02071 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"European Conference on Computer Vision. Springer, 22--38","author":"Chen Yuan","year":"2024","unstructured":"Yuan Chen, Zi-han Ding, Ziqin Wang, Yan Wang, Lijun Zhang, and Si Liu. 2024. Asynchronous large language model enhanced planner for autonomous driving. In European Conference on Computer Vision. Springer, 22--38."},{"key":"e_1_3_2_1_8_1","volume-title":"Pluto: Pushing the limit of imitation learning-based planning for autonomous driving. arXiv preprint arXiv:2404.14327","author":"Cheng Jie","year":"2024","unstructured":"Jie Cheng, Yingbing Chen, and Qifeng Chen. 2024. Pluto: Pushing the limit of imitation learning-based planning for autonomous driving. arXiv preprint arXiv:2404.14327 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611364"},{"key":"e_1_3_2_1_10_1","volume-title":"Transfuser: Imitation with transformer-based sensor fusion for autonomous driving","author":"Chitta Kashyap","year":"2022","unstructured":"Kashyap Chitta, Aditya Prakash, Bernhard Jaeger, Zehao Yu, Katrin Renz, and Andreas Geiger. 2022. Transfuser: Imitation with transformer-based sensor fusion for autonomous driving. IEEE transactions on pattern analysis and machine intelligence 45, 11 (2022), 12878--12895."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00942"},{"key":"e_1_3_2_1_12_1","volume-title":"reason, and react: Drive as you say, with large language models in autonomous vehicles","author":"Cui Can","year":"2024","unstructured":"Can Cui, Yunsheng Ma, Xu Cao, Wenqian Ye, and Ziran Wang. 2024. Receive, reason, and react: Drive as you say, with large language models in autonomous vehicles. IEEE Intelligent Transportation Systems Magazine (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"e_1_3_2_1_14_1","volume-title":"Conference on Robot Learning. PMLR, 1268--1281","author":"Dauner Daniel","year":"2023","unstructured":"Daniel Dauner, Marcel Hallgarten, Andreas Geiger, and Kashyap Chitta. 2023. Parting with misconceptions about learning-based vehicle motion planning. In Conference on Robot Learning. PMLR, 1268--1281."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13668--13677","author":"Ding Xinpeng","year":"2024","unstructured":"Xinpeng Ding, Jianhua Han, Hang Xu, Xiaodan Liang,Wei Zhang, and Xiaomeng Li. 2024. Holistic autonomous driving understanding by bird's-eye-view injected multi-modal large models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13668--13677."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00102"},{"key":"e_1_3_2_1_17_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC57777.2023.10421854"},{"key":"e_1_3_2_1_19_1","volume-title":"Dmedriver: Integrating human decision logic and 3d scene perception in autonomous driving. arXiv preprint arXiv:2401.03641","author":"Han Wencheng","year":"2024","unstructured":"Wencheng Han, Dongqian Guo, Cheng-Zhong Xu, and Jianbing Shen. 2024. Dmedriver: Integrating human decision logic and 3d scene perception in autonomous driving. arXiv preprint arXiv:2401.03641 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_31"},{"key":"e_1_3_2_1_21_1","volume-title":"European Conference on Computer Vision. Springer, 386--404","author":"Hu Yihan","year":"2024","unstructured":"Yihan Hu, Siqi Chai, Zhening Yang, Jingyu Qian, Kun Li, Wenxin Shao, Haichao Zhang, Wei Xu, and Qiang Liu. 2024. Solving motion planning tasks with a scalable generative model. In European Conference on Computer Vision. Springer, 386--404."},{"key":"e_1_3_2_1_22_1","volume-title":"Imitation with spatial-temporal heatmap: 2nd place solution for nuplan challenge. arXiv preprint arXiv:2306.15700","author":"Hu Yihan","year":"2023","unstructured":"Yihan Hu, Kun Li, Pingyuan Liang, Jingyu Qian, Zhening Yang, Haichao Zhang, Wenxin Shao, Zhuangzhuang Ding, Wei Xu, and Qiang Liu. 2023. Imitation with spatial-temporal heatmap: 2nd place solution for nuplan challenge. arXiv preprint arXiv:2306.15700 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610550"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00361"},{"key":"e_1_3_2_1_26_1","volume-title":"Differentiable integrated motion prediction and planning with learnable cost function for autonomous driving","author":"Huang Zhiyu","year":"2023","unstructured":"Zhiyu Huang, Haochen Liu, Jingda Wu, and Chen Lv. 2023. Differentiable integrated motion prediction and planning with learnable cost function for autonomous driving. IEEE transactions on neural networks and learning systems (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Emma: End-to-end multimodal model for autonomous driving. arXiv preprint arXiv:2410.23262","author":"Hwang Jyh-Jing","year":"2024","unstructured":"Jyh-Jing Hwang, Runsheng Xu, Hubert Lin, Wei-Chih Hung, Jingwei Ji, Kristy Choi, Di Huang, Tong He, Paul Covington, Benjamin Sapp, et al. 2024. Emma: End-to-end multimodal model for autonomous driving. arXiv preprint arXiv:2410.23262 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and Benchmarking. arXiv preprint arXiv:2406.15349","author":"NAV","year":"2024","unstructured":"NAV IM. 2024. NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and Benchmarking. arXiv preprint arXiv:2406.15349 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02105"},{"key":"e_1_3_2_1_30_1","volume-title":"Senna: Bridging large vision-language models and end-to-end autonomous driving. arXiv preprint arXiv:2410.22313","author":"Jiang Bo","year":"2024","unstructured":"Bo Jiang, Shaoyu Chen, Bencheng Liao, Xingyu Zhang, Wei Yin, Qian Zhang, Chang Huang, Wenyu Liu, and Xinggang Wang. 2024. Senna: Bridging large vision-language models and end-to-end autonomous driving. arXiv preprint arXiv:2410.22313 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00766"},{"key":"e_1_3_2_1_32_1","volume-title":"Surrealdriver: Designing generative driver agent simulation framework in urban contexts based on large language model. arXiv preprint arXiv:2309.13193 5, 7","author":"Jin Ye","year":"2023","unstructured":"Ye Jin, Xiaoxi Shen, Huiling Peng, Xiaoan Liu, Jingli Qin, Jiayang Li, Jintao Xie, Peizhong Gao, Guyue Zhou, and Jiangtao Gong. 2023. Surrealdriver: Designing generative driver agent simulation framework in urban contexts based on large language model. arXiv preprint arXiv:2309.13193 5, 7 (2023), 8."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793742"},{"key":"e_1_3_2_1_34_1","volume-title":"Minimum required attention: A human-centered approach to driver inattention. Human factors 59, 3","author":"Kircher Katja","year":"2017","unstructured":"Katja Kircher and Christer Ahlstrom. 2017. Minimum required attention: A human-centered approach to driver inattention. Human factors 59, 3 (2017), 471--484."},{"key":"e_1_3_2_1_35_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_36_1","volume-title":"Generative Planning with 3D-vision Language Pre-training for End-to-End Autonomous Driving. arXiv preprint arXiv:2501.08861","author":"Li Tengpeng","year":"2025","unstructured":"Tengpeng Li, Hanli Wang, Xianfei Li, Wenlong Liao, Tao He, and Pai Peng. 2025. Generative Planning with 3D-vision Language Pre-training for End-to-End Autonomous Driving. arXiv preprint arXiv:2501.08861 (2025)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611197"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC57777.2023.10421993"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.3390\/ijerph18073716"},{"key":"e_1_3_2_1_41_1","volume-title":"European Conference on Computer Vision. Springer, 403--420","author":"Ma Yingzi","year":"2024","unstructured":"Yingzi Ma, Yulong Cao, Jiachen Sun, Marco Pavone, and Chaowei Xiao. 2024. Dolphins: Multimodal language model for driving. In European Conference on Computer Vision. Springer, 403--420."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01434"},{"key":"e_1_3_2_1_43_1","volume-title":"Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Yuxi Qian, Junjie Ye, Hang Zhao, and Yue Wang. 2023. Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"A language agent for autonomous driving. arXiv preprint arXiv:2311.10813","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Junjie Ye, Yuxi Qian, Marco Pavone, and YueWang. 2023. A language agent for autonomous driving. arXiv preprint arXiv:2311.10813 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"European Conference on Computer Vision. Springer, 292--308","author":"Nie Ming","year":"2024","unstructured":"Ming Nie, Renyuan Peng, ChunweiWang, Xinyue Cai, Jianhua Han, Hang Xu, and Li Zhang. 2024. Reason2drive: Towards interpretable and chain-based reasoning for autonomous driving. In European Conference on Computer Vision. Springer, 292--308."},{"key":"e_1_3_2_1_46_1","volume-title":"International conference on machine learning. PmLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748--8763."},{"key":"e_1_3_2_1_47_1","volume-title":"Plant: Explainable planning transformers via objectlevel representations. arXiv preprint arXiv:2210.14222","author":"Renz Katrin","year":"2022","unstructured":"Katrin Renz, Kashyap Chitta, Otniel-Bogdan Mercea, A Koepke, Zeynep Akata, and Andreas Geiger. 2022. Plant: Explainable planning transformers via objectlevel representations. arXiv preprint arXiv:2210.14222 (2022)."},{"key":"e_1_3_2_1_48_1","volume-title":"Conference on Robot Learning. PMLR, 718--728","author":"Scheel Oliver","year":"2022","unstructured":"Oliver Scheel, Luca Bergamini, Maciej Wolczyk, B\u0142a\u017cej Osi\u0144ski, and Peter Ondruska. 2022. Urban driver: Learning to drive from real-world demonstrations using policy gradients. In Conference on Robot Learning. PMLR, 718--728."},{"key":"e_1_3_2_1_49_1","volume-title":"Masayoshi Tomizuka,Wei Zhan, and Mingyu Ding.","author":"Sha Hao","year":"2023","unstructured":"Hao Sha, Yao Mu, Yuxuan Jiang, Li Chen, Chenfeng Xu, Ping Luo, Shengbo Eben Li, Masayoshi Tomizuka,Wei Zhan, and Mingyu Ding. 2023. Languagempc: Large language models as decision makers for autonomous driving. arXiv preprint arXiv:2310.03026 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01432"},{"key":"e_1_3_2_1_51_1","volume-title":"Llm-assist: Enhancing closed-loop planning with language-based reasoning. arXiv preprint arXiv:2401.00125","author":"Sharan SP","year":"2023","unstructured":"SP Sharan, Francesco Pittaluga, Manmohan Chandraker, et al. 2023. Llm-assist: Enhancing closed-loop planning with language-based reasoning. arXiv preprint arXiv:2401.00125 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"European Conference on Computer Vision. Springer, 256--274","author":"Sima Chonghao","year":"2024","unstructured":"Chonghao Sima, Katrin Renz, Kashyap Chitta, Li Chen, Hanxue Zhang, Chengen Xie, Jens Bei\u00dfwenger, Ping Luo, Andreas Geiger, and Hongyang Li. 2024. Drivelm: Driving with graph visual question answering. In European Conference on Computer Vision. Springer, 256--274."},{"key":"e_1_3_2_1_53_1","volume-title":"International conference on machine learning. PMLR, 6105--6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105--6114."},{"key":"e_1_3_2_1_54_1","volume-title":"LHPF: Look back the History and Plan for the Future in Autonomous Driving. arXiv preprint arXiv:2411.17253","author":"Wang Sheng","year":"2024","unstructured":"Sheng Wang, Yao Tian, Xiaodong Mei, Ge Sun, Jie Cheng, Fulong Ma, Pedro V Sander, and Junwei Liang. 2024. LHPF: Look back the History and Plan for the Future in Autonomous Driving. arXiv preprint arXiv:2411.17253 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Omnidrive: A holistic llm-agent framework for autonomous driving with 3d perception, reasoning and planning. arXiv preprint arXiv:2405.01533","author":"Wang Shihao","year":"2024","unstructured":"Shihao Wang, Zhiding Yu, Xiaohui Jiang, Shiyi Lan, Min Shi, Nadine Chang, Jan Kautz, Ying Li, and Jose M Alvarez. 2024. Omnidrive: A holistic llm-agent framework for autonomous driving with 3d perception, reasoning and planning. arXiv preprint arXiv:2405.01533 (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2023.3325300"},{"key":"e_1_3_2_1_57_1","volume-title":"Drivemlm: Aligning multi-modal large language models with behavioral planning states for autonomous driving. arXiv preprint arXiv:2312.09245","author":"Xie Jiangwei","year":"2023","unstructured":"WenhaiWang, Jiangwei Xie, ChuanYang Hu, Haoming Zou, Jianan Fan,Wenwen Tong, Yang Wen, Silei Wu, Hanming Deng, Zhiqi Li, et al. 2023. Drivemlm: Aligning multi-modal large language models with behavioral planning states for autonomous driving. arXiv preprint arXiv:2312.09245 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"Chengtian Lang, Chao Huang, Zhaoran Wang, Zhuoran Yang, and Qi Zhu.","author":"Wang Yixuan","year":"2023","unstructured":"Yixuan Wang, Ruochen Jiao, Sinong Simon Zhan, Chengtian Lang, Chao Huang, Zhaoran Wang, Zhuoran Yang, and Qi Zhu. 2023. Empowering autonomous driving with large language models: A safety perspective. arXiv preprint arXiv:2312.00812 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"Dilu: A knowledge-driven approach to autonomous driving with large language models. arXiv preprint arXiv:2309.16292","author":"Wen Licheng","year":"2023","unstructured":"Licheng Wen, Daocheng Fu, Xin Li, Xinyu Cai, Tao Ma, Pinlong Cai, Min Dou, Botian Shi, Liang He, and Yu Qiao. 2023. Dilu: A knowledge-driven approach to autonomous driving with large language models. arXiv preprint arXiv:2309.16292 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Siddhartha Srinivasa, Eric M Wolff, and Xin Huang.","author":"Xu Yi","year":"2024","unstructured":"Yi Xu, Yuxin Hu, Zaiwei Zhang, Gregory P Meyer, Siva Karthik Mustikovela, Siddhartha Srinivasa, Eric M Wolff, and Xin Huang. 2024. Vlm-ad: End-to-end autonomous driving through vision-language model supervision. arXiv preprint arXiv:2412.14446 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Drivegpt4: Interpretable end-to-end autonomous driving via large language model","author":"Xu Zhenhua","year":"2024","unstructured":"Zhenhua Xu, Yujia Zhang, Enze Xie, Zhen Zhao, Yong Guo, Kwan-Yee K Wong, Zhenguo Li, and Hengshuang Zhao. 2024. Drivegpt4: Interpretable end-to-end autonomous driving via large language model. IEEE Robotics and Automation Letters (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"Diffusion-es: Gradient-free planning with diffusion for autonomous driving and zero-shot instruction following. arXiv preprint arXiv:2402.06559","author":"Yang Brian","year":"2024","unstructured":"Brian Yang, Huangyuan Su, Nikolaos Gkanatsios, Tsung-Wei Ke, Ayush Jain, Jeff Schneider, and Katerina Fragkiadaki. 2024. Diffusion-es: Gradient-free planning with diffusion for autonomous driving and zero-shot instruction following. arXiv preprint arXiv:2402.06559 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"Llm4drive: A survey of large language models for autonomous driving. arXiv preprint arXiv:2311.01043","author":"Yang Zhenjie","year":"2023","unstructured":"Zhenjie Yang, Xiaosong Jia, Hongyang Li, and Junchi Yan. 2023. Llm4drive: A survey of large language models for autonomous driving. arXiv preprint arXiv:2311.01043 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"CALMM-Drive: Confidence-Aware Autonomous Driving with Large Multimodal Model. arXiv preprint arXiv:2412.04209","author":"Yao Ruoyu","year":"2024","unstructured":"Ruoyu Yao, Yubin Wang, Haichao Liu, Rui Yang, Zengqi Peng, Lei Zhu, and Jun Ma. 2024. CALMM-Drive: Confidence-Aware Autonomous Driving with Large Multimodal Model. arXiv preprint arXiv:2412.04209 (2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"Rag-driver: Generalisable driving explanations with retrieval-augmented in-context learning in multi-modal large language model. arXiv preprint arXiv:2402.10828","author":"Yuan Jianhao","year":"2024","unstructured":"Jianhao Yuan, Shuyang Sun, Daniel Omeiza, Bo Zhao, Paul Newman, Lars Kunze, and Matthew Gadd. 2024. Rag-driver: Generalisable driving explanations with retrieval-augmented in-context learning in multi-modal large language model. arXiv preprint arXiv:2402.10828 (2024)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00886"},{"key":"e_1_3_2_1_67_1","first-page":"1781","article-title":"Multi-modality fusion perception and computing in autonomous driving","volume":"57","author":"Zhang Y","year":"2020","unstructured":"Y Zhang, S Zhang, Y Zhang, J Ji, Y Duan, Y Huang, J Peng, and Y Zahng. 2020. Multi-modality fusion perception and computing in autonomous driving. J. Comput. Res. Dev 57 (2020), 1781--1799.","journal-title":"J. Comput. Res. Dev"},{"key":"e_1_3_2_1_68_1","volume-title":"Planagent: A multi-modal large language agent for closed-loop vehicle motion planning. arXiv preprint arXiv:2406.01587","author":"Zheng Yupeng","year":"2024","unstructured":"Yupeng Zheng, Zebin Xing, Qichao Zhang, Bu Jin, Pengfei Li, Yuhang Zheng, Zhongpu Xia, Kun Zhan, Xianpeng Lang, Yaran Chen, et al. 2024. Planagent: A multi-modal large language agent for closed-loop vehicle motion planning. arXiv preprint arXiv:2406.01587 (2024)."},{"key":"e_1_3_2_1_69_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755730","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:03:01Z","timestamp":1765339381000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755730"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":69,"alternative-id":["10.1145\/3746027.3755730","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755730","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}