{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T17:30:13Z","timestamp":1778347813255,"version":"3.51.4"},"reference-count":47,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100013061","name":"Jilin Scientific and Technological Development Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013061","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.inffus.2026.104314","type":"journal-article","created":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T16:29:30Z","timestamp":1774110570000},"page":"104314","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["MMDrive: Interactive Scene Understanding Beyond Vision with Multi-representational 
Fusion"],"prefix":"10.1016","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9515-6061","authenticated-orcid":false,"given":"Minghui","family":"Hou","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2405-2936","authenticated-orcid":false,"given":"Wei-Hsing","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8134-2188","authenticated-orcid":false,"given":"Shaofeng","family":"Liang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-4508","authenticated-orcid":false,"given":"Daizong","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0692-1084","authenticated-orcid":false,"given":"Tai-Hao","family":"Wen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9125-8983","authenticated-orcid":false,"given":"Gang","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4013-2107","authenticated-orcid":false,"given":"Runwei","family":"Guan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3180-7347","authenticated-orcid":false,"given":"Weiping","family":"Ding","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.inffus.2026.104314_bib0001","article-title":"A comprehensive survey of vision-language models: pretrained models, fine-tuning, prompt engineering, adapters, and benchmark datasets","author":"Danish","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104314_bib0002","series-title":"European Conference on Computer Vision","first-page":"256","article-title":"Drivelm: driving with graph visual question answering","author":"Sima","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0003","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2025.3546767","article-title":"PolarBEVU: multi-Camera 3D object detection in polar bird\u2019s-Eye view via unprojection","author":"Hou","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104314_sbref0004","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103575","article-title":"Object detection with multimodal large vision-language models: an in-depth review","author":"Sapkota","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104314_sbref0005","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4950","article-title":"Generative planning with 3d-vision language pre-training for end-to-end autonomous driving","author":"Li","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_bib0006","unstructured":"A. Gopalkrishnan, R. Greer, M. Trivedi, Multi-frame, lightweight & efficient vision-language models for question answering in autonomous driving, (2024). https:\/\/doi.org\/10.48550\/arXiv.2403.19838."},{"key":"10.1016\/j.inffus.2026.104314_sbref0007","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"12089","article-title":"Mpdrive: improving spatial understanding with marker-based prompt learning for autonomous driving","author":"Zhang","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_bib0008","unstructured":"E. Zhang, X. Dai, M. Huang, Y. Lv, Q. Miao, Minidrive: more efficient vision-language models with multi-level 2d features as text tokens for autonomous driving, (2024). 
https:\/\/doi.org\/10.48550\/arXiv.2409.07267."},{"key":"10.1016\/j.inffus.2026.104314_bib0009","unstructured":"S. Jiao, Y. Fang, B. Peng, W. Chen, B. Veeravalli, Lavida drive: vision-text interaction vlm for autonomous driving with token selection, recovery and enhancement, (2024). https:\/\/doi.org\/10.48550\/arXiv.2411.12980."},{"key":"10.1016\/j.inffus.2026.104314_sbref0010","article-title":"A survey on occupancy perception for autonomous driving: the information fusion perspective","author":"Xu","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104314_sbref0011","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4542","article-title":"Nuscenes-qa: a multi-modal visual question answering benchmark for autonomous driving scenario","author":"Qian","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0012","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"issue":"8","key":"10.1016\/j.inffus.2026.104314_sbref0013","doi-asserted-by":"crossref","first-page":"5625","DOI":"10.1109\/TPAMI.2024.3369699","article-title":"Vision-language models for vision tasks: a survey","volume":"46","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.inffus.2026.104314_sbref0014","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103604","article-title":"COST: Contrastive one-stage transformer for vision-language small object tracking","volume":"126","author":"Zhang","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104314_sbref0015","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.inffus.2026.104314_sbref0016","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.inffus.2026.104314_sbref0017","series-title":"Advances in Neural Information Processing Systems","article-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"10.1016\/j.inffus.2026.104314_sbref0018","series-title":"Advances in Neural Information Processing Systems","article-title":"InstructBLIP: towards general-purpose vision-Language models with instruction tuning","author":"Dai","year":"2023"},{"key":"10.1016\/j.inffus.2026.104314_sbref0019","series-title":"The Twelfth International Conference on Learning Representations","article-title":"MiniGPT-4: enhancing vision-Language understanding with advanced large language models","author":"Zhu","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0020","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6700","article-title":"Gqa: a new dataset for real-world visual reasoning and compositional question answering","author":"Hudson","year":"2019"},{"key":"10.1016\/j.inffus.2026.104314_sbref0021","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103714","article-title":"Embracing knowledge integration from the vision-language model for federated domain generalization on multi-source fused 
data","volume":"127","author":"Liu","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104314_sbref0022","series-title":"European Conference on Computer Vision","first-page":"252","article-title":"Lingoqa: visual question answering for autonomous driving","author":"Marcu","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0023","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"8395","article-title":"Passing the driving knowledge test","author":"Wei","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0024","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"22442","article-title":"Omnidrive: a holistic vision-language dataset for autonomous driving with counterfactual reasoning","author":"Wang","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0025","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"9431","article-title":"Fine-grained evaluation of large vision-language models in autonomous driving","author":"Li","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0026","series-title":"IEEE International Conference on Robotics and Automation","first-page":"2774","article-title":"BEVFusion: Multi-Task multi-Sensor fusion with unified Bird\u2019s-Eye view representation","author":"Liu","year":"2023"},{"key":"10.1016\/j.inffus.2026.104314_sbref0027","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15120","article-title":"Lmdrive: closed-loop end-to-end driving with large language models","author":"Shao","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0028","doi-asserted-by":"crossref","DOI":"10.1109\/LRA.2024.3440097","article-title":"Drivegpt4: interpretable end-to-end autonomous driving via large language model","author":"Xu","year":"2024","journal-title":"IEEE Robot. Automat. Lett."},{"key":"10.1016\/j.inffus.2026.104314_sbref0029","series-title":"European Conference on Computer Vision","first-page":"403","article-title":"Dolphins: multimodal language model for driving","author":"Ma","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_bib0030","unstructured":"X. Tian, J. Gu, B. Li, Y. Liu, Y. Wang, Z. Zhao, K. Zhan, P. Jia, X. Lang, H. Zhao, Drivevlm: the convergence of autonomous driving and large vision-language models, arXiv: 2402.12289. 
https:\/\/proceedings.mlr.press\/v270\/tian25c.html."},{"key":"10.1016\/j.inffus.2026.104314_bib0031","series-title":"European Conference on Computer Vision","first-page":"292","article-title":"Reason2drive: towards interpretable and chain-based reasoning for autonomous driving","author":"Nie","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0032","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"11993","article-title":"Simlingo: vision-only closed-loop autonomous driving with language-action alignment","author":"Renz","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_bib0033","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"8359","article-title":"Language prompt for autonomous driving","author":"Wu","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_bib0034","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"9247","article-title":"Lidar-llm: exploring the potential of large language models for 3d lidar understanding","author":"Yang","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0035","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"12068","article-title":"SOLVE: Synergy of language-Vision and end-to-End networks for autonomous driving","author":"Chen","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0036","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"26966","article-title":"VLR-Driver: Large vision-Language-Reasoning models for embodied autonomous driving","author":"Kong","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0037","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"3870","article-title":"Reasondrive: efficient visual question answering for autonomous vehicles with reasoning-Enhanced small vision-Language models","author":"Chahe","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_bib0038","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5513","article-title":"Unireplknet: a universal perception large-kernel convnet for audio video point cloud time-series and image recognition","author":"Ding","year":"2024"},{"issue":"140","key":"10.1016\/j.inffus.2026.104314_sbref0039","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.inffus.2026.104314_sbref0040","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"11971","article-title":"Uniscene: unified occupancy-centric driving scene generation","author":"Li","year":"2025"},{"key":"10.1016\/j.inffus.2026.104314_sbref0041","series-title":"Advances in Neural Information Processing Systems","article-title":"Michelangelo: conditional 3D shape generation based on shape-Image-Text aligned latent representation","author":"Zhao","year":"2023"},{"key":"10.1016\/j.inffus.2026.104314_bib0042","unstructured":"S. Bai, K. Chen, X. Liu, J. Wang, W. Ge, S. Song, K. Dang, P. Wang, S. Wang, J. Tang, H. Zhong, Y. Zhu, M. Yang, Z. Li, J. Wan, P. Wang, W. Ding, Z. Fu, Y. Xu, J. Ye, X. Zhang, T. Xie, Z. Cheng, H. Zhang, Z. Yang, H. Xu, J. Lin, Qwen2.5-VL technical report, (2025). 
10.48550\/arXiv.2502.13923."},{"key":"10.1016\/j.inffus.2026.104314_bib0043","unstructured":"A. Yang, A. Li, B. Yang, B. Zhang, B. Hui, B. Zheng, B. Yu, C. Gao, C. Huang, C. Lv, C. Zheng, D. Liu, F. Zhou, F. Huang, F. Hu, H. Ge, H. Wei, H. Lin, J. Tang, J. Yang, J. Tu, J. Zhang, J. Yang, J. Yang, J. Zhou, J. Lin, K. Dang, K. Bao, K. Yang, L. Yu, L. Deng, M. Li, M. Xue, M. Li, P. Zhang, P. Wang, Q. Zhu, R. Men, R. Gao, S. Liu, S. Luo, T. Li, T. Tang, W. Yin, X. Ren, X. Wang, X. Zhang, X. Ren, Y. Fan, Y. Su, Y. Zhang, Y. Zhang, Y. Wan, Y. Liu, Z. Wang, Z. Cui, Z. Zhang, Z. Zhou, Z. Qiu, Qwen3 technical report, (2025). 10.48550\/arXiv.2505.09388."},{"key":"10.1016\/j.inffus.2026.104314_bib0044","series-title":"Iclr","article-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.inffus.2026.104314_sbref0045","series-title":"The Twelfth International Conference on Learning Representations","article-title":"LLaMA-Adapter: efficient fine-tuning of large language models with zero-initialized attention","author":"Zhang","year":"2024"},{"key":"10.1016\/j.inffus.2026.104314_sbref0046","series-title":"Advances in Neural Information Processing Systems","article-title":"Occ3d: a large-Scale 3D occupancy prediction benchmark for autonomous driving","author":"Tian","year":"2023"},{"key":"10.1016\/j.inffus.2026.104314_bib0047","article-title":"DriveMLM: aligning multi-Modal large language models with behavioral planning states for autonomous driving","volume":"abs\/2312.09245","author":"Wang","year":"2023","journal-title":"CoRR"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253526001934?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253526001934?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T15:42:06Z","timestamp":1777304526000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1566253526001934"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":47,"alternative-id":["S1566253526001934"],"URL":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104314","relation":{},"ISSN":["1566-2535"],"issn-type":[{"value":"1566-2535","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"MMDrive: Interactive Scene Understanding Beyond Vision with Multi-representational Fusion","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104314","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104314"}}
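The record above is a standard Crossref REST API work object for DOI 10.1016/j.inffus.2026.104314. As a minimal sketch of how such a record can be retrieved and unpacked, the Python example below queries the public Crossref endpoint (https://api.crossref.org/works/{doi}) and prints the title, authors, and reference count using the field names visible in the record; the `mailto` address is a hypothetical placeholder that opts the request into Crossref's polite pool.

```python
import requests

# DOI taken from the record above.
DOI = "10.1016/j.inffus.2026.104314"

# Crossref's public REST API returns exactly this kind of JSON envelope;
# the "mailto" query parameter is the polite-pool convention (placeholder address).
url = f"https://api.crossref.org/works/{DOI}"
resp = requests.get(url, params={"mailto": "you@example.org"}, timeout=30)
resp.raise_for_status()

# The payload mirrors the "message" object shown above.
work = resp.json()["message"]

# "title" and "container-title" are arrays in the Crossref schema.
print("Title:  ", work["title"][0])
print("Journal:", work["container-title"][0])

# Authors carry "given"/"family" fields plus an optional ORCID.
for a in work.get("author", []):
    print(f'  {a.get("given", "")} {a.get("family", "")}  {a.get("ORCID", "")}')

print("References:", work.get("reference-count"))
print("Published: ", work["issued"]["date-parts"][0])  # e.g. [2026, 9]
```

The same parsing applies to a locally saved copy of the record (load it with `json.load` instead of calling the API); only the `"message"` sub-object is needed for bibliographic fields.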