{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T20:13:54Z","timestamp":1778530434352,"version":"3.51.4"},"reference-count":220,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T00:00:00Z","timestamp":1778803200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T00:00:00Z","timestamp":1778803200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T00:00:00Z","timestamp":1778803200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"State Key Laboratory of Intelligent Transportation System","award":["2025-B006"],"award-info":[{"award-number":["2025-B006"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62273198"],"award-info":[{"award-number":["62273198"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["52221005"],"award-info":[{"award-number":["52221005"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Natural Science Foundation Program","award":["L241017"],"award-info":[{"award-number":["L241017"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Internet Things J."],"published-print":{"date-parts":[[2026,5,15]]},"DOI":"10.1109\/jiot.2026.3668085","type":"journal-article","created":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T20:58:08Z","timestamp":1772053088000},"page":"20375-20397","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Large Language Models for Perception in Autonomous Driving: Architecture, Taxonomy, and Challenges"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1901-8967","authenticated-orcid":false,"given":"Ying","family":"Jing","sequence":"first","affiliation":[{"name":"State Key Laboratory of Intelligent Green Vehicle and Mobility and School of Vehicle and Mobility,, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-9037","authenticated-orcid":false,"given":"Xinyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Intelligent Green Vehicle and Mobility and School of Vehicle and Mobility,, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2192-8673","authenticated-orcid":false,"given":"Mo","family":"Zhou","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Intelligent Green Vehicle and Mobility and School of Vehicle and Mobility,, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuchuan","family":"Ji","sequence":"additional","affiliation":[{"name":"School of Rail Transportation, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanchao","family":"Ding","sequence":"additional","affiliation":[{"name":"School of Transportation Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jialun","family":"Yin","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Intelligent Green Vehicle and Mobility and School of Vehicle and Mobility,, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5692-3310","authenticated-orcid":false,"given":"Ruizhi","family":"Jia","sequence":"additional","affiliation":[{"name":"Department of Electrical and Photonics Engineering, Technical University of Denmark, Kongens Lyngby, Denmark"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yijin","family":"Xiong","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Intelligent Green Vehicle and Mobility and School of Vehicle and Mobility,, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kun","family":"Zhao","sequence":"additional","affiliation":[{"name":"Suzhou Automobile Research Institute, Tsinghua University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun","family":"Yang","sequence":"additional","affiliation":[{"name":"Suzhou Automobile Research Institute, Tsinghua University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0437-5112","authenticated-orcid":false,"given":"Jun","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Intelligent Green Vehicle and Mobility and School of Vehicle and Mobility,, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4042-6044","authenticated-orcid":false,"given":"Huaping","family":"Liu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, State Key Laboratory of Intelligent Technology and Systems, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"European Regional Status Report on Road Safety 2019","year":"2020"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1155\/2020\/8867757"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IRPS.2018.8353618"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-26250-1_26"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01784-z"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00534"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01436"},{"key":"ref8","article-title":"BEVFusion: Multi-task multi-sensor fusion with unified bird\u2019s-eye view representation","author":"Liu","year":"2022","journal-title":"arXiv:2205.13542"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2022.3231369"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2022.3175375"},{"key":"ref11","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref12","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref13","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref14","volume-title":"ChatGPT: Optimizing Language Models for Dialogue","year":"2022"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/tiv.2024.3402136"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC58415.2024.10919629"},{"key":"ref18","article-title":"LLM4Drive: A survey of large language models for autonomous driving","author":"Yang","year":"2023","journal-title":"arXiv:2311.01043"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2024.3406372"},{"key":"ref20","article-title":"A survey on large language model-empowered autonomous driving","author":"Zhu","year":"2024","journal-title":"arXiv:2409.14165"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/OJVT.2025.3604823"},{"key":"ref22","article-title":"Foundation models in autonomous driving: A survey on scenario generation and scenario analysis","author":"Gao","year":"2025","journal-title":"arXiv:2506.11526"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.3390\/drones9040238"},{"key":"ref24","article-title":"Tokenize the world into object-level knowledge to address long-tail events in autonomous driving","author":"Tian","year":"2024","journal-title":"arXiv:2407.00959"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_17"},{"key":"ref26","article-title":"RoboTron-drive: All-in-one large multimodal model for autonomous driving","author":"Huang","year":"2024","journal-title":"arXiv:2412.07689"},{"key":"ref27","article-title":"Is a 3D-tokenized LLM the key to reliable autonomous driving?","author":"Bai","year":"2024","journal-title":"arXiv:2405.18361"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32220"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02090"},{"key":"ref30","article-title":"HiLM-D: Towards high-resolution understanding in multimodal large language models for autonomous driving","author":"Ding","year":"2023","journal-title":"arXiv:2309.05186"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01297"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"ref33","article-title":"DriveMLLM: A benchmark for spatial understanding with multimodal large language models in autonomous driving","author":"Guo","year":"2024","journal-title":"arXiv:2411.13112"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01129"},{"key":"ref35","article-title":"MTA: Multimodal task alignment for BEV perception and captioning","author":"Ma","year":"2024","journal-title":"arXiv:2411.10639"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3621971"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"ref38","article-title":"EMMA: End-to-end multimodal model for autonomous driving","author":"Hwang","year":"2024","journal-title":"arXiv:2410.23262"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72980-5_15"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72649-1_21"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611485"},{"key":"ref42","article-title":"LLMI3D: MLLM-based 3D perception from a single 2D image","author":"Yang","year":"2024","journal-title":"arXiv:2408.07422"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i9.33001"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i9.33067"},{"key":"ref45","first-page":"311","article-title":"BLEU: A method for automatic evaluation of machine translation","volume-title":"Proc. 40th Annu. Meeting Assoc. Comput. Linguistics","author":"Papineni"},{"key":"ref46","article-title":"ROUGE 2.0: Updated and improved measures for evaluation of summarization tasks","author":"Ganesan","year":"2018","journal-title":"arXiv:1803.01937"},{"key":"ref47","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. ACL Workshop Intrinsic Extrinsic Eval. Measures Mach. Transl. Summarization","author":"Banerjee"},{"key":"ref48","first-page":"4566","article-title":"CIDEr: Consensus-based image description evaluation","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR)","author":"Vedantam"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref50","first-page":"1","article-title":"BERTScore: Evaluating text generation with BERT","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zhang"},{"key":"ref51","volume-title":"BEVLM: Got-Based Integration of BEV and LLM for Driving With Language","year":"2025"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72943-0_15"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1215"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32902"},{"key":"ref56","article-title":"Talk2Radar: Bridging natural language with 4D mmWave radar for 3D referring expression comprehension","author":"Guan","year":"2024","journal-title":"arXiv:2405.12821"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3147324"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02061"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"ref60","article-title":"Language-image models with 3D understanding","author":"Hyun Cho","year":"2024","journal-title":"arXiv:2405.03685"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i7.37441"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00195"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"ref64","article-title":"One million scenes for autonomous driving: ONCE dataset","author":"Mao","year":"2021","journal-title":"arXiv:2106.11037"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/IROS60139.2025.11247360"},{"key":"ref66","article-title":"NuGrounding: A multi-view 3D visual grounding framework in autonomous driving","author":"Li","year":"2025","journal-title":"arXiv:2503.22436"},{"key":"ref67","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref68","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023","journal-title":"arXiv:2306.14824"},{"key":"ref69","article-title":"MultiModal-GPT: A vision and language model for dialogue with humans","author":"Gong","year":"2023","journal-title":"arXiv:2305.04790"},{"key":"ref70","article-title":"Prismer: A vision-language model with multi-task experts","author":"Liu","year":"2023","journal-title":"arXiv:2303.02506"},{"key":"ref71","article-title":"Visual ChatGPT: Talking, drawing and editing with visual foundation models","author":"Wu","year":"2023","journal-title":"arXiv:2303.04671"},{"key":"ref72","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv:2010.11929"},{"key":"ref73","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"ref75","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"arXiv:2301.12597"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"ref77","article-title":"BEVDet: High-performance multi-camera 3D object detection in bird-eye-view","author":"Huang","year":"2021","journal-title":"arXiv:2112.11790"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"ref80","article-title":"Time will tell: New outlooks and a baseline for temporal multi-view 3D object detection","author":"Park","year":"2022","journal-title":"arXiv:2210.02443"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"ref82","first-page":"180","article-title":"DETR3D: 3D object detection from multi-view images via 3D-to-2D queries","volume-title":"Proc. Conf. Robot Learn.","author":"Wang"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"ref85","article-title":"OccLLaMA: An occupancy-language-action generative world model for autonomous driving","author":"Wei","year":"2024","journal-title":"arXiv:2409.03272"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127665"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00472"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01298"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_29"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794195"},{"key":"ref91","first-page":"726","article-title":"Safety-enhanced autonomous driving using interpretable sensor fusion transformer","volume-title":"Proc. Conf. Robot Learn.","author":"Shao"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref93","article-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01311"},{"key":"ref95","article-title":"Lyrics: Boosting fine-grained language-vision alignment and comprehension via semantic-aware visual objects","author":"Lu","year":"2023","journal-title":"arXiv:2312.05278"},{"key":"ref96","first-page":"57","article-title":"Bootstrapping vision-language learning with decoupled language pre-training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jian"},{"key":"ref97","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020","journal-title":"arXiv:2010.04159"},{"key":"ref98","volume-title":"GPT-4V(Ision) System Card","year":"2023"},{"key":"ref99","volume-title":"GPT-4O Technical Report","year":"2024"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"issue":"70","key":"ref102","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref103","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref104","article-title":"Qwen technical report","volume-title":"arXiv:2309.16609","author":"Bai","year":"2023"},{"issue":"3","key":"ref105","first-page":"6","article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","volume":"2","author":"Chiang","year":"2023","journal-title":"See"},{"key":"ref106","article-title":"MLLM-SUL: Multimodal large language model for semantic scene understanding and localization in traffic scenarios","author":"Fan","year":"2024","journal-title":"arXiv:2412.19406"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref108","article-title":"V2V-LLM: Vehicle-to-vehicle cooperative autonomous driving with multimodal large language models","author":"Chiu","year":"2025","journal-title":"arXiv:2502.09980"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i16.38386"},{"key":"ref110","article-title":"ALN-p3: Unified language alignment for perception, prediction, and planning in autonomous driving","author":"Ma","year":"2025","journal-title":"arXiv:2505.15158"},{"key":"ref111","article-title":"Hint-AD: Holistically aligned interpretability in end-to-end autonomous driving","author":"Ding","year":"2024","journal-title":"arXiv:2409.06702"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW65960.2025.00113"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01392"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW65960.2025.00120"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_18"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610779"},{"key":"ref117","article-title":"GPT-4V explorations: Mining autonomous driving","author":"Li","year":"2024","journal-title":"arXiv:2406.16817"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.3390\/automation5040029"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW65960.2025.00074"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP51287.2024.10647129"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/IROS60139.2025.11247237"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2026.104314"},{"key":"ref123","article-title":"DrivePI: Spatial-aware 4D MLLM for unified autonomous driving understanding, perception, prediction and planning","author":"Liu","year":"2025","journal-title":"arXiv:2512.12799"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00356"},{"key":"ref125","article-title":"LaVida drive: Vision-text interaction VLM for autonomous driving with token selection, recovery and enhancement","author":"Jiao","year":"2024","journal-title":"arXiv:2411.12980"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72995-9_23"},{"key":"ref127","first-page":"129","article-title":"Embodied understanding of driving scenarios","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Yun-song"},{"key":"ref128","article-title":"RAG-driver: Generalisable driving explanations with retrieval-augmented in-context learning in multi-modal large language model","author":"Yuan","year":"2024","journal-title":"arXiv:2402.10828"},{"key":"ref129","article-title":"UniUGP: Unifying understanding, generation, and planing for end-to-end autonomous driving","author":"Lu","year":"2025","journal-title":"arXiv:2512.09864"},{"key":"ref130","article-title":"MiniDrive: More efficient vision-language models with multi-level 2D features as text tokens for autonomous driving","author":"Zhang","year":"2024","journal-title":"arXiv:2409.07267"},{"key":"ref131","article-title":"SimpleLLM4AD: An end-to-end vision-language model with graph visual question answering for autonomous driving","author":"Zheng","year":"2024","journal-title":"arXiv:2407.21293"},{"key":"ref132","article-title":"A novel MLLM-based approach for autonomous driving in different weather conditions","author":"Fourati","year":"2024","journal-title":"arXiv:2411.10603"},{"key":"ref133","article-title":"Tracking meets large multimodal models for driving scenario understanding","author":"Ishaq","year":"2025","journal-title":"arXiv:2503.14498"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01120"},{"key":"ref135","article-title":"Prune2Drive: A plug-and-play framework for accelerating vision-language models in autonomous driving","author":"Xiong","year":"2025","journal-title":"arXiv:2508.13305"},{"key":"ref136","article-title":"FutureSightDrive: Thinking visually with spatio-temporal CoT for autonomous driving","author":"Zeng","year":"2025","journal-title":"arXiv:2505.17685"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2024.3518615"},{"key":"ref138","article-title":"Driving with InternVL: Oustanding champion in the track on driving with language of the autonomous grand challenge at CVPR 2024","author":"Li","year":"2024","journal-title":"arXiv:2412.07247"},{"key":"ref139","article-title":"OccVLA: Vision-language-action model with implicit 3D occupancy supervision","author":"Liu","year":"2025","journal-title":"arXiv:2509.05578"},{"key":"ref140","article-title":"LMAD: Integrated end-to-end vision-language model for explainable autonomous driving","author":"Song","year":"2025","journal-title":"arXiv:2508.12404"},{"key":"ref141","article-title":"BeLLA: End-to-end birds eye view large language assistant for autonomous driving","author":"Mohan","year":"2025","journal-title":"arXiv:2512.06096"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0732"},{"key":"ref144","volume-title":"Introducing MPT-7b: A New Standard for Open-Source, Commercially Usable LLMs","year":"2023"},{"key":"ref145","article-title":"OPT: Open pre-trained transformer language models","author":"Zhang","year":"2022","journal-title":"arXiv:2205.01068"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.105171"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"ref148","article-title":"TopoMLP: A simple yet strong pipeline for driving topology reasoning","author":"Wu","year":"2023","journal-title":"arXiv:2310.06753"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01463"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00527"},{"issue":"140","key":"ref153","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2019","journal-title":"J. Mach. Learn. Res."},{"key":"ref154","article-title":"InternLM2 technical report","volume-title":"arXiv:2403.17297","author":"Cai","year":"2024"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1800"},{"key":"ref158","first-page":"4582","article-title":"Prefix-tuning: Optimizing continuous prompts for generation","volume-title":"Proc. 59th Annu. Meeting Assoc. Comput. Linguistics 11th Int. Joint Conf. Natural Lang. Process. (ACL\/IJCNLP)","author":"Li"},{"key":"ref159","first-page":"1","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Hu"},{"key":"ref160","first-page":"506","article-title":"Learning multiple visual domains with residual adapters","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Rebuffi"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3440097"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01282"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/jproc.2026.3678829"},{"key":"ref166","article-title":"SafeDrive: Knowledge- and data-driven risk-sensitive decision-making for autonomous vehicles with large language models","author":"Zhou","year":"2024","journal-title":"arXiv:2412.13238"},{"key":"ref167","article-title":"BEV-CLIP: Multi-modal BEV retrieval methodology for complex scene in autonomous driving","author":"Jia","year":"2024"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-96-7008-6_5"},{"key":"ref169","article-title":"VideoPoet: A large language model for zero-shot video generation","author":"Kondratyuk","year":"2023","journal-title":"arXiv:2312.14125"},{"key":"ref170","article-title":"Zero-shot video question answering with procedural programs","author":"Choudhury","year":"2023","journal-title":"arXiv:2312.00937"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3402952"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73229-4_27"},{"key":"ref174","article-title":"ADriver-I: A general world model for autonomous driving","author":"Jia","year":"2023","journal-title":"arXiv:2311.13549"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28077"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00080"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00325"},{"key":"ref179","article-title":"Explaining how a deep neural network trained with end-to-end learning steers a car","author":"Bojarski","year":"2017","journal-title":"arXiv:1704.07911"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.320"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00178"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1145\/3639372"},{"key":"ref183","article-title":"Can large language models explain themselves? A study of LLM-generated self-explanations","author":"Huang","year":"2023","journal-title":"arXiv:2310.11207"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01432"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00793"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_7"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2025.3564594"},{"key":"ref188","article-title":"DSDrive: Distilling large language model for lightweight end-to-end autonomous driving with unified reasoning and planning","author":"Liu","year":"2025","journal-title":"arXiv:2505.05360"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02568"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/IV55156.2024.10588799"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3551098"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/IV64158.2025.11097672"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.140"},{"key":"ref194","article-title":"Towards a unified view of parameter-efficient transfer learning","author":"He","year":"2021","journal-title":"arXiv:2110.04366"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref196","article-title":"Multi-frame, lightweight & efficient vision-language models for question answering in autonomous driving","author":"Gopalkrishnan","year":"2024","journal-title":"arXiv:2403.19838"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2025.3618738"},{"key":"ref198","article-title":"QuantV2X: A fully quantized multi-agent system for cooperative perception","author":"Zhao","year":"2025","journal-title":"arXiv:2509.03704"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00423"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2025.3574725"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128656"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111522"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2025.107673"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1111\/mice.13503"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/IROS60139.2025.11245999"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2024.104428"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1016\/j.comcom.2025.108152"},{"key":"ref208","article-title":"DriveVLM: The convergence of autonomous driving and large vision-language models","author":"Tian","year":"2024","journal-title":"arXiv:2402.12289"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3219049"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1145\/3517820"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2021.3124599"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.106775"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02308"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3526056"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02296"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW67362.2025.00341"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01791"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00155"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/IV51971.2022.9827222"},{"key":"ref220","article-title":"Enhancing LLM-based autonomous driving agents to mitigate perception attacks","author":"Song","year":"2024","journal-title":"arXiv:2409.14488"}],"container-title":["IEEE Internet of Things Journal"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6488907\/11513275\/11411796.pdf?arnumber=11411796","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T19:48:21Z","timestamp":1778528901000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11411796\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,15]]},"references-count":220,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/jiot.2026.3668085","relation":{},"ISSN":["2327-4662","2372-2541"],"issn-type":[{"value":"2327-4662","type":"electronic"},{"value":"2372-2541","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,15]]}}}