{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T17:49:09Z","timestamp":1779904149210,"version":"3.53.1"},"reference-count":368,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T00:00:00Z","timestamp":1772496000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.imavis.2026.105944","type":"journal-article","created":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T16:26:33Z","timestamp":1772555193000},"page":"105944","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["All you need for object detection: From pixels, points, and prompts to Next-Gen fusion and multimodal LLMs\/VLMs in autonomous vehicles"],"prefix":"10.1016","volume":"169","author":[{"given":"Sayed Pedram Haeri","family":"Boroujeni","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Niloufar","family":"Mehrabi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hazim","family":"Alzorgan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mahlagha","family":"Fazeli","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abolfazl","family":"Razi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2026.105944_b1","article-title":"Secure and efficient vehicle control of autonomous vehicles using federated deep reinforcement learning","author":"ELallid","year":"2025","journal-title":"Appl. Soft Comput."},{"key":"10.1016\/j.imavis.2026.105944_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2024.111802","article-title":"Robust autonomous vehicle control by leveraging multi-stage MPC and quantized CNN in HIL framework","volume":"162","author":"Khosravian","year":"2024","journal-title":"Appl. Soft Comput."},{"key":"10.1016\/j.imavis.2026.105944_b3","doi-asserted-by":"crossref","first-page":"459","DOI":"10.1016\/j.jsr.2024.11.028","article-title":"A scoping review on the methods used to assess health-related quality of life and disability burden in evaluations of road safety interventions","volume":"92","author":"Gerhard","year":"2025","journal-title":"J. Saf. Res."},{"key":"10.1016\/j.imavis.2026.105944_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.trc.2024.104797","article-title":"A qualitative AI security risk assessment of autonomous vehicles","volume":"169","author":"Grosse","year":"2024","journal-title":"Transp. Res. C"},{"key":"10.1016\/j.imavis.2026.105944_b5","article-title":"Uncertainty quantification for safe and reliable autonomous vehicles: A review of methods and applications","author":"Wang","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.energy.2025.135669","article-title":"Hierarchical deep reinforcement learning based multi-agent game control for energy consumption and traffic efficiency improving of autonomous vehicles","volume":"323","author":"Chen","year":"2025","journal-title":"Energy"},{"key":"10.1016\/j.imavis.2026.105944_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105484","article-title":"Real-time localization and navigation method for autonomous vehicles based on multi-modal data fusion by integrating memory transformer and DDQN","volume":"156","author":"Zha","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105944_b8","unstructured":"Z. Huang, C. Feng, F. Yan, B. Xiao, Z. Jie, Y. Zhong, X. Liang, L. Ma, RoboTron-Drive: All-in-One Large Multimodal Model for Autonomous Driving, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 8011\u20138021."},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3714478","article-title":"Advancing explainable autonomous vehicle systems: A comprehensive review and research roadmap","volume":"14","author":"Tekkesinoglu","year":"2025","journal-title":"ACM Trans. Human-Robot Interact."},{"key":"10.1016\/j.imavis.2026.105944_b10","doi-asserted-by":"crossref","DOI":"10.1016\/j.trc.2023.104280","article-title":"Using spatiotemporal stacks for precise vehicle tracking from roadside 3D LiDAR data","volume":"154","author":"Chang","year":"2023","journal-title":"Transp. Res. C"},{"key":"10.1016\/j.imavis.2026.105944_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.110563","article-title":"Resilient object detection for autonomous vehicles: Integrating deep learning and sensor fusion in adverse conditions","volume":"151","author":"Thottempudi","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b12","article-title":"An in-depth examination of SLAM methods: Challenges, advancements, and applications in complex scenes for autonomous driving","author":"Wang","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b13","article-title":"Heterogeneous multiscale cooperative perception for connected autonomous vehicles via V2X interaction","author":"Zha","year":"2025","journal-title":"IEEE Internet Things J."},{"key":"10.1016\/j.imavis.2026.105944_b14","doi-asserted-by":"crossref","DOI":"10.1016\/j.trc.2022.103625","article-title":"Optimal roadside units location for path flow reconstruction in a connected vehicle environment","volume":"138","author":"Salari","year":"2022","journal-title":"Transp. Res. C"},{"key":"10.1016\/j.imavis.2026.105944_b15","series-title":"2025 International Conference on Computational, Communication and Information Technology","first-page":"809","article-title":"Autonomous vehicle navigation systems: Machine learning for real-time traffic prediction","author":"Praveen","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b16","doi-asserted-by":"crossref","DOI":"10.1109\/OJVT.2025.3559461","article-title":"Detection of multiple small biased GPS spoofing attacks on autonomous vehicles using time series analysis","author":"Mohammadi","year":"2025","journal-title":"IEEE Open J. Veh. Technol."},{"key":"10.1016\/j.imavis.2026.105944_b17","series-title":"2025 International Conference on Multi-Agent Systems for Collaborative Intelligence","first-page":"1143","article-title":"Embedded large language models for enhanced human-machine interface in autonomous vehicles","author":"RS","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b18","series-title":"2025 Fourth International Conference on Power, Control and Computing Technologies (ICPC2T)","first-page":"1","article-title":"Improving faster R-CNN for vehicle detection under varying conditions with domain adaptation technique","author":"Kumar","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b19","series-title":"2025 International Conference on Computational, Communication and Information Technology","first-page":"612","article-title":"AI-powered object detection for autonomous vehicles: A comparative study of machine learning models","author":"Shrivastava","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b20","doi-asserted-by":"crossref","DOI":"10.1016\/j.mex.2025.103387","article-title":"Insights of semantic segmentation using the DeepLab architecture for autonomous driving","author":"Subhedar","year":"2025","journal-title":"MethodsX"},{"key":"10.1016\/j.imavis.2026.105944_b21","series-title":"The International Conference Optoelectronic Information and Optical Engineering (OIOE2024)","first-page":"884","article-title":"Ranging research on telematics based on mask R-CNN dual eye stereo vision ranging algorithm","volume":"vol. 13513","author":"Chen","year":"2025"},{"issue":"1","key":"10.1016\/j.imavis.2026.105944_b22","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1109\/TITS.2025.3628969","article-title":"Large (vision) language models for autonomous vehicles: Current trends and future directions","volume":"27","author":"Tian","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b23","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103158","article-title":"UAVs meet LLMs: Overviews and perspectives towards agentic low-altitude mobility","volume":"122","author":"Tian","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105944_b24","series-title":"2024 2nd International Conference on Foundation and Large Language Models","first-page":"501","article-title":"Vlm-auto: Vlm-based autonomous driving assistant with human-like behavior and understanding for complex road scenes","author":"Guo","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b25","article-title":"Developments in 3D object detection for autonomous driving: A review","author":"Wang","year":"2025","journal-title":"IEEE Sens. J."},{"issue":"9","key":"10.1016\/j.imavis.2026.105944_b26","doi-asserted-by":"crossref","first-page":"2794","DOI":"10.3390\/s25092794","article-title":"A survey of the multi-sensor fusion object detection task in autonomous driving","volume":"25","author":"Wang","year":"2025","journal-title":"Sensors"},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b27","doi-asserted-by":"crossref","first-page":"1757","DOI":"10.1007\/s00371-024-03480-6","article-title":"A review of 3D object detection based on autonomous driving","volume":"41","author":"Wang","year":"2025","journal-title":"Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105944_b28","doi-asserted-by":"crossref","DOI":"10.1109\/TITS.2024.3439557","article-title":"Robustness-aware 3d object detection in autonomous driving: A review and outlook","author":"Song","year":"2024","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"issue":"1","key":"10.1016\/j.imavis.2026.105944_b29","doi-asserted-by":"crossref","first-page":"20","DOI":"10.3390\/wevj15010020","article-title":"Emerging trends in autonomous vehicle perception: Multimodal fusion for 3D object detection","volume":"15","author":"Alaba","year":"2024","journal-title":"World Electr. Veh. J."},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b30","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1109\/JPROC.2023.3238524","article-title":"Object detection in 20 years: A survey","volume":"111","author":"Zou","year":"2023","journal-title":"Proc. IEEE"},{"issue":"5","key":"10.1016\/j.imavis.2026.105944_b31","doi-asserted-by":"crossref","first-page":"3537","DOI":"10.1109\/TPAMI.2023.3346386","article-title":"3D object detection from images for autonomous driving: A survey","volume":"46","author":"Ma","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b32","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.108796","article-title":"3D object detection for autonomous driving: A survey","volume":"130","author":"Qian","year":"2022","journal-title":"Pattern Recognit."},{"issue":"2","key":"10.1016\/j.imavis.2026.105944_b33","doi-asserted-by":"crossref","first-page":"722","DOI":"10.1109\/TITS.2020.3023541","article-title":"Deep learning for image and point cloud fusion in autonomous driving: A review","volume":"23","author":"Cui","year":"2021","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b34","doi-asserted-by":"crossref","first-page":"1341","DOI":"10.1109\/TITS.2020.2972974","article-title":"Deep multi-modal object detection and semantic segmentation for autonomous driving: Datasets, methods, and challenges","volume":"22","author":"Feng","year":"2020","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"issue":"8","key":"10.1016\/j.imavis.2026.105944_b35","doi-asserted-by":"crossref","first-page":"3135","DOI":"10.1109\/TITS.2019.2926042","article-title":"Is it safe to drive? An overview of factors, metrics, and datasets for driveability assessment in autonomous driving","volume":"21","author":"Guo","year":"2019","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b36","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102822","article-title":"Security analysis and adaptive false data injection against multi-sensor fusion localization for autonomous driving","volume":"117","author":"Hu","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105944_b37","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102859","article-title":"Advancements in perception system with multi-sensor fusion for embodied agents","volume":"117","author":"Du","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105944_b38","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102647","article-title":"Fusion-based modeling of an intelligent algorithm for enhanced object detection using a deep learning approach on radar and camera data","volume":"113","author":"Wu","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105944_b39","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102370","article-title":"Joint semantic segmentation using representations of LiDAR point clouds and camera images","volume":"108","author":"Wu","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105944_b40","doi-asserted-by":"crossref","first-page":"218","DOI":"10.1016\/j.inffus.2022.09.018","article-title":"Multi-GNSS PPP\/INS\/Vision\/LiDAR tightly integrated system for precise navigation in urban environments","volume":"90","author":"Li","year":"2023","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105944_b41","doi-asserted-by":"crossref","first-page":"527","DOI":"10.1109\/OJITS.2023.3295502","article-title":"High-definition maps: Comprehensive survey, challenges, and future perspectives","volume":"4","author":"Elghazaly","year":"2023","journal-title":"IEEE Open J. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b42","doi-asserted-by":"crossref","DOI":"10.1109\/TITS.2025.3554781","article-title":"Exploring radar data representations in autonomous driving: A comprehensive review","author":"Yao","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b43","doi-asserted-by":"crossref","DOI":"10.1016\/j.commtr.2025.100229","article-title":"PriorFusion: Unified integration of priors for robust road perception in autonomous driving","volume":"5","author":"Tang","year":"2025","journal-title":"Commun. Transp. Res."},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b44","doi-asserted-by":"crossref","first-page":"1352","DOI":"10.1007\/s11263-024-02235-z","article-title":"Maptrv2: An end-to-end framework for online vectorized hd map construction","volume":"133","author":"Liao","year":"2025","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.imavis.2026.105944_b45","series-title":"European Conference on Computer Vision","first-page":"73","article-title":"Online vectorized hd map construction using geometry","author":"Zhang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b46","series-title":"European Conference on Computer Vision","first-page":"422","article-title":"Enhancing vectorized map perception with historical rasterized maps","author":"Zhang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b47","first-page":"7730","article-title":"Waymax: An accelerated, data-driven simulator for large-scale autonomous driving research","volume":"36","author":"Gulino","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b48","doi-asserted-by":"crossref","DOI":"10.3389\/fcomp.2024.1423129","article-title":"A systemic survey of the omniverse platform and its applications in data generation, simulation and metaverse","volume":"6","author":"Ahmed","year":"2024","journal-title":"Front. Comput. Sci."},{"key":"10.1016\/j.imavis.2026.105944_b49","series-title":"2020 IEEE 23rd International Conference on Intelligent Transportation Systems","first-page":"1","article-title":"Lgsvl simulator: A high fidelity simulator for autonomous driving","author":"Rong","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b50","series-title":"Conference on Robot Learning","first-page":"1","article-title":"CARLA: An open urban driving simulator","author":"Dosovitskiy","year":"2017"},{"key":"10.1016\/j.imavis.2026.105944_b51","series-title":"Field and Service Robotics: Results of the 11th International Conference","first-page":"621","article-title":"Airsim: High-fidelity visual and physical simulation for autonomous vehicles","author":"Shah","year":"2017"},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b52","doi-asserted-by":"crossref","first-page":"892","DOI":"10.1002\/cae.22492","article-title":"Application of baidu apollo open platform in a course of control simulation experiments","volume":"30","author":"Feng","year":"2022","journal-title":"Comput. Appl. Eng. Educ."},{"key":"10.1016\/j.imavis.2026.105944_b53","series-title":"2018 IEEE 12th International Conference on Application of Information and Communication Technologies","first-page":"1","article-title":"Self-driving car steering angle prediction based on deep neural network an example of carnd udacity simulator","author":"Smolyakov","year":"2018"},{"key":"10.1016\/j.imavis.2026.105944_b54","series-title":"2018 ACM\/IEEE 9th International Conference on Cyber-Physical Systems","first-page":"287","article-title":"Autoware on board: Enabling autonomous vehicles with embedded systems","author":"Kato","year":"2018"},{"key":"10.1016\/j.imavis.2026.105944_b55","series-title":"2004 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)(IEEE Cat. No. 04CH37566)","first-page":"2149","article-title":"Design and use paradigms for gazebo, an open-source multi-robot simulator","volume":"vol. 3","author":"Koenig","year":"2004"},{"key":"10.1016\/j.imavis.2026.105944_b56","unstructured":"B. Wymann, E. Espi\u00e9, C. Guionneau, C. Dimitrakakis, R. Coulom, A. Sumner, Torcs, the open racing car simulator, 4 (6) (2000) 2. Software Available At http:\/\/torcs.sourceforge.net."},{"key":"10.1016\/j.imavis.2026.105944_b57","series-title":"Oasis: Open agent social interaction simulations with one million agents","author":"Yang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b58","series-title":"2020 IEEE 4th Information Technology, Networking, Electronic and Automation Control Conference","first-page":"1814","article-title":"Development and verification of traffic confrontation simulation test platform based on PanoSim","volume":"vol. 1","author":"Zhang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b59","series-title":"2021 IEEE International Conference on Internet of Things and Intelligence Systems (IoTaIS)","first-page":"215","article-title":"System configuration of human-in-the-loop simulation for level 3 autonomous vehicle using IPG CarMaker","author":"Hong","year":"2021"},{"key":"10.1016\/j.imavis.2026.105944_b60","article-title":"SCANeR studio\/SE-workbench-RF physical radar sensor","author":"Faucher","year":"2022","journal-title":"Proc. DSC Eur. VR"},{"key":"10.1016\/j.imavis.2026.105944_b61","series-title":"2013 IEEE Intelligent Vehicles Symposium","first-page":"713","article-title":"A simulation tool suite for developing connected vehicle systems","author":"Tideman","year":"2013"},{"issue":"9","key":"10.1016\/j.imavis.2026.105944_b62","doi-asserted-by":"crossref","first-page":"24","DOI":"10.1007\/s38311-017-0088-y","article-title":"Simulation accelerates development of autonomous driving","volume":"119","author":"Sovani","year":"2017","journal-title":"ATZ Worldw."},{"issue":"28","key":"10.1016\/j.imavis.2026.105944_b63","doi-asserted-by":"crossref","DOI":"10.1126\/scirobotics.aaw0863","article-title":"AADS: Augmented autonomous driving simulation using data-driven algorithms","volume":"4","author":"Li","year":"2019","journal-title":"Sci. Robot."},{"issue":"17","key":"10.1016\/j.imavis.2026.105944_b64","doi-asserted-by":"crossref","first-page":"3486","DOI":"10.3390\/electronics13173486","article-title":"Virtual tools for testing autonomous driving: A survey and benchmark of simulators, datasets, and competitions","volume":"13","author":"Zhang","year":"2024","journal-title":"Electronics"},{"key":"10.1016\/j.imavis.2026.105944_b65","first-page":"99","article-title":"CARSIM: Car-following model for simulation of traffic in normal and stop-and-go conditions","volume":"1194","author":"Benekohal","year":"1988","journal-title":"Transp. Res. Rec."},{"issue":"5","key":"10.1016\/j.imavis.2026.105944_b66","first-page":"36","article-title":"Multi-sensor fusion and cooperative perception for autonomous driving: A review","volume":"15","author":"Xiang","year":"2023","journal-title":"IEEE Intell. Transp. Syst. Mag."},{"key":"10.1016\/j.imavis.2026.105944_b67","doi-asserted-by":"crossref","unstructured":"D. Kent, M. Alyaqoub, X. Lu, H. Khatounabadi, K. Sung, C. Scheller, A. Dalat, A. bin Thabit, R. Whitley, H. Radha, MSU-4S-The Michigan State University Four Seasons Dataset, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22658\u201322667.","DOI":"10.1109\/CVPR52733.2024.02138"},{"key":"10.1016\/j.imavis.2026.105944_b68","series-title":"OmniHD-scenes: A next-generation multimodal dataset for autonomous driving","author":"Zheng","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b69","doi-asserted-by":"crossref","unstructured":"M. Alibeigi, W. Ljungbergh, A. Tonderski, G. Hess, A. Lilja, C. Lindstr\u00f6m, D. Motorniuk, J. Fu, J. Widahl, C. Petersson, Zenseact open dataset: A large-scale and diverse multimodal dataset for autonomous driving, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 20178\u201320188.","DOI":"10.1109\/ICCV51070.2023.01846"},{"key":"10.1016\/j.imavis.2026.105944_b70","doi-asserted-by":"crossref","unstructured":"C.A. Diaz-Ruiz, Y. Xia, Y. You, J. Nino, J. Chen, J. Monica, X. Chen, K. Luo, Y. Wang, M. Emond, et al., Ithaca365: Dataset and driving perception under repeated and challenging weather conditions, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 21383\u201321392.","DOI":"10.1109\/CVPR52688.2022.02069"},{"key":"10.1016\/j.imavis.2026.105944_b71","series-title":"One million scenes for autonomous driving: Once dataset","author":"Mao","year":"2021"},{"key":"10.1016\/j.imavis.2026.105944_b72","series-title":"2021 Ieee International Intelligent Transportation Systems Conference (Itsc)","first-page":"2987","article-title":"Pixset: An opportunity for 3d computer vision to go beyond point clouds with a full-waveform lidar dataset","author":"D\u00e9ziel","year":"2021"},{"key":"10.1016\/j.imavis.2026.105944_b73","series-title":"2021 IEEE International Intelligent Transportation Systems Conference","first-page":"3095","article-title":"Pandaset: Advanced sensor suite dataset for autonomous driving","author":"Xiao","year":"2021"},{"key":"10.1016\/j.imavis.2026.105944_b74","series-title":"A2d2: Audi autonomous driving dataset","author":"Geyer","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b75","series-title":"Self-driving car dataset","author":"Roboflow","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b76","doi-asserted-by":"crossref","unstructured":"P. Sun, H. Kretzschmar, X. Dotiwalla, A. Chouard, V. Patnaik, P. Tsui, J. Guo, Y. Zhou, Y. Chai, B. Caine, et al., Scalability in perception for autonomous driving: Waymo open dataset, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 2446\u20132454.","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"10.1016\/j.imavis.2026.105944_b77","series-title":"2020 IEEE International Conference on Robotics and Automation","first-page":"2267","article-title":"A* 3d dataset: Towards autonomous driving in challenging environments","author":"Pham","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b78","series-title":"2020 IEEE Intelligent Vehicles Symposium","first-page":"1929","article-title":"The ind dataset: A drone dataset of naturalistic road user trajectories at german intersections","author":"Bock","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b79","series-title":"2022 IEEE Intelligent Vehicles Symposium","first-page":"958","article-title":"The exid dataset: A real-world trajectory dataset of highly interactive highway scenarios in Germany","author":"Moers","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b80","doi-asserted-by":"crossref","unstructured":"H. Caesar, V. Bankiti, A.H. Lang, S. Vora, V.E. Liong, Q. Xu, A. Krishnan, Y. Pan, G. Baldan, O. Beijbom, nuscenes: A multimodal dataset for autonomous driving, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 11621\u201311631.","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"10.1016\/j.imavis.2026.105944_b81","doi-asserted-by":"crossref","unstructured":"M.-F. Chang, J. Lambert, P. Sangkloy, J. Singh, S. Bak, A. Hartnett, D. Wang, P. Carr, S. Lucey, D. Ramanan, et al., Argoverse: 3d tracking and forecasting with rich maps, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 8748\u20138757.","DOI":"10.1109\/CVPR.2019.00895"},{"key":"10.1016\/j.imavis.2026.105944_b82","series-title":"2019 International Conference on Robotics and Automation","first-page":"6685","article-title":"BLVD: Building a large-scale 5D semantics benchmark for autonomous driving","author":"Xue","year":"2019"},{"key":"10.1016\/j.imavis.2026.105944_b83","series-title":"A commute in data: The comma2k19 dataset","author":"Schafer","year":"2018"},{"key":"10.1016\/j.imavis.2026.105944_b84","unstructured":"F. Yu, W. Xian, Y. Chen, F. Liu, M. Liao, V. Madhavan, T. Darrell, et al. Bdd100k: A diverse driving video database with scalable annotation tooling, 2 (5) (2018) 6. arXiv preprint arXiv:1805.04687."},{"key":"10.1016\/j.imavis.2026.105944_b85","doi-asserted-by":"crossref","unstructured":"X. Huang, X. Cheng, Q. Geng, B. Cao, D. Zhou, P. Wang, Y. Lin, R. Yang, The apolloscape dataset for autonomous driving, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, 2018, pp. 954\u2013960.","DOI":"10.1109\/CVPRW.2018.00141"},{"key":"10.1016\/j.imavis.2026.105944_b86","doi-asserted-by":"crossref","unstructured":"M. Cordts, M. Omran, S. Ramos, T. Rehfeld, M. Enzweiler, R. Benenson, U. Franke, S. Roth, B. Schiele, The cityscapes dataset for semantic urban scene understanding, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 3213\u20133223.","DOI":"10.1109\/CVPR.2016.350"},{"key":"10.1016\/j.imavis.2026.105944_b87","series-title":"2020 IEEE International Conference on Robotics and Automation","first-page":"6433","article-title":"The oxford radar robotcar dataset: A radar extension to the oxford robotcar dataset","author":"Barnes","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b88","series-title":"2012 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3354","article-title":"Are we ready for autonomous driving? the kitti vision benchmark suite","author":"Geiger","year":"2012"},{"key":"10.1016\/j.imavis.2026.105944_b89","series-title":"European Conference on Computer Vision","first-page":"331","article-title":"Roscenes: A large-scale multi-view 3d dataset for roadside perception","author":"Zhu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b90","series-title":"2023 IEEE 26th International Conference on Intelligent Transportation Systems","first-page":"1030","article-title":"Tumtraf intersection dataset: All you need for urban 3d camera-lidar roadside perception","author":"Zimmer","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b91","series-title":"2022 IEEE Intelligent Vehicles Symposium","first-page":"965","article-title":"A9-dataset: Multi-sensor infrastructure-based dataset for mobility research","author":"Cre\u00df","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b92","series-title":"2022 International Conference on Robotics and Automation","first-page":"2539","article-title":"Ips300+: A challenging multi-modal data sets for intersection perception system","author":"Wang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b93","doi-asserted-by":"crossref","unstructured":"X. Ye, M. Shu, H. Li, Y. Shi, Y. Li, G. Wang, X. Tan, E. Ding, Rope3d: The roadside perception dataset for autonomous driving and monocular 3d object detection task, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 21341\u201321350.","DOI":"10.1109\/CVPR52688.2022.02065"},{"key":"10.1016\/j.imavis.2026.105944_b94","series-title":"2022 IEEE Intelligent Vehicles Symposium","first-page":"1127","article-title":"Lumpi: The leibniz university multi-perspective intersection dataset","author":"Busch","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b95","series-title":"Weakly supervised training of monocular 3d object detectors using wide baseline multi-view traffic camera data","author":"Howe","year":"2021"},{"key":"10.1016\/j.imavis.2026.105944_b96","series-title":"Interaction dataset: An international, adversarial and cooperative motion dataset in interactive driving scenarios with semantic maps","author":"Zhan","year":"2019"},{"key":"10.1016\/j.imavis.2026.105944_b97","doi-asserted-by":"crossref","unstructured":"Z. Tang, M. Naphade, M.-Y. Liu, X. Yang, S. Birchfield, S. Wang, R. Kumar, D. Anastasiu, J.-N. Hwang, Cityflow: A city-scale benchmark for multi-target multi-camera vehicle tracking and re-identification, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 8797\u20138806.","DOI":"10.1109\/CVPR.2019.00900"},{"key":"10.1016\/j.imavis.2026.105944_b98","series-title":"DriveLMM-o1: A step-by-step reasoning dataset and large multimodal model for driving scenario understanding","author":"Ishaq","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b99","series-title":"2025 IEEE\/CVF Winter Conference on Applications of Computer Vision","first-page":"7817","article-title":"Automated evaluation of large vision-language models on self-driving corner cases","author":"Chen","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b100","series-title":"V2v-llm: Vehicle-to-vehicle cooperative autonomous driving with multi-modal large language models","author":"Chiu","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b101","series-title":"European Conference on Computer Vision","first-page":"256","article-title":"Drivelm: Driving with graph visual question answering","author":"Sima","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b102","doi-asserted-by":"crossref","unstructured":"Y. Inoue, Y. Yada, K. Tanahashi, Y. Yamaguchi, Nuscenes-mqa: Integrated evaluation of captions and qa for autonomous driving datasets using markup annotations, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024, pp. 930\u2013938.","DOI":"10.1109\/WACVW60836.2024.00104"},{"key":"10.1016\/j.imavis.2026.105944_b103","series-title":"OmniDrive: A holistic vision-language dataset for autonomous driving with counterfactual reasoning","author":"Wang","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b104","series-title":"Tokenize the world into object-level knowledge to address long-tail events in autonomous driving","author":"Tian","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b105","doi-asserted-by":"crossref","unstructured":"X. Cao, T. Zhou, Y. Ma, W. Ye, C. Cui, K. Tang, Z. Cao, K. Liang, Z. Wang, J.M. Rehg, et al., Maplm: A real-world large-scale vision-language benchmark for map and traffic scene understanding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 21819\u201321830.","DOI":"10.1109\/CVPR52733.2024.02061"},{"key":"10.1016\/j.imavis.2026.105944_b106","series-title":"European Conference on Computer Vision","first-page":"252","article-title":"LingoQA: Visual question answering for autonomous driving","author":"Marcu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b107","doi-asserted-by":"crossref","unstructured":"T. Qian, J. Chen, L. Zhuo, Y. Jiao, Y.-G. Jiang, Nuscenes-qa: A multi-modal visual question answering benchmark for autonomous driving scenario, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, 2024, pp. 4542\u20134550.","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"10.1016\/j.imavis.2026.105944_b108","doi-asserted-by":"crossref","unstructured":"X. Ding, J. Han, H. Xu, X. Liang, W. Zhang, X. Li, Holistic autonomous driving understanding by bird\u2019s-eye-view injected multi-modal large models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 13668\u201313677.","DOI":"10.1109\/CVPR52733.2024.01297"},{"key":"10.1016\/j.imavis.2026.105944_b109","doi-asserted-by":"crossref","unstructured":"S. Malla, C. Choi, I. Dwivedi, J.H. Choi, J. Li, Drama: Joint risk localization and captioning in driving, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2023, pp. 1043\u20131052.","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"10.1016\/j.imavis.2026.105944_b110","doi-asserted-by":"crossref","unstructured":"Y. Li, Z. Li, N. Chen, M. Gong, Z. Lyu, Z. Wang, P. Jiang, C. Feng, Multiagent multitraversal multimodal self-driving: Open mars dataset, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22041\u201322051.","DOI":"10.1109\/CVPR52733.2024.02081"},{"key":"10.1016\/j.imavis.2026.105944_b111","series-title":"An extensible framework for open heterogeneous collaborative perception","author":"Lu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b112","doi-asserted-by":"crossref","unstructured":"Y. Hu, Y. Lu, R. Xu, W. Xie, S. Chen, Y. Wang, Collaboration helps camera overtake lidar in 3d detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 9243\u20139252.","DOI":"10.1109\/CVPR52729.2023.00892"},{"key":"10.1016\/j.imavis.2026.105944_b113","first-page":"28462","article-title":"Asynchrony-robust collaborative perception via bird\u2019s eye view flow","volume":"36","author":"Wei","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b114","doi-asserted-by":"crossref","unstructured":"R. Xu, X. Xia, J. Li, H. Li, S. Zhang, Z. Tu, Z. Meng, H. Xiang, X. Dong, R. Song, et al., V2v4real: A real-world large-scale dataset for vehicle-to-vehicle cooperative perception, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 13712\u201313722.","DOI":"10.1109\/CVPR52729.2023.01318"},{"key":"10.1016\/j.imavis.2026.105944_b115","series-title":"2023 IEEE Intelligent Vehicles Symposium","first-page":"1","article-title":"LUCOOP: Leibniz university cooperative perception and urban navigation dataset","author":"Axmann","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b116","first-page":"4874","article-title":"Where2comm: Communication-efficient collaborative perception via spatial confidence maps","volume":"35","author":"Hu","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b117","series-title":"2022 International Conference on Robotics and Automation","first-page":"2583","article-title":"Opv2v: An open benchmark dataset and fusion pipeline for perception with vehicle-to-vehicle communication","author":"Xu","year":"2022"},{"issue":"2","key":"10.1016\/j.imavis.2026.105944_b118","doi-asserted-by":"crossref","first-page":"1502","DOI":"10.1109\/LRA.2021.3137888","article-title":"Fast and robust registration of partially overlapping point clouds","volume":"7","author":"Arnold","year":"2021","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.imavis.2026.105944_b119","doi-asserted-by":"crossref","first-page":"255","DOI":"10.5194\/isprs-archives-XLIII-B2-2021-255-2021","article-title":"Comap: A synthetic dataset for collective multi-agent perception of autonomous driving","volume":"43","author":"Yuan","year":"2021","journal-title":"Int. Arch. Photogramm. Remote. Sens. Spat. Inf. Sci."},{"key":"10.1016\/j.imavis.2026.105944_b120","series-title":"2019 IEEE 39th International Conference on Distributed Computing Systems","first-page":"514","article-title":"Cooper: Cooperative perception for connected autonomous vehicles based on 3d point clouds","author":"Chen","year":"2019"},{"key":"10.1016\/j.imavis.2026.105944_b121","series-title":"A benchmark for vision-centric HD mapping by V2I systems","author":"Fan","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b122","series-title":"2024 IEEE Intelligent Vehicles Symposium","first-page":"490","article-title":"V2X-DSI: A density-sensitive infrastructure LiDAR benchmark for economic vehicle-to-everything cooperative perception","author":"Liu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b123","series-title":"V2X-radar: A multi-modal dataset with 4D radar for cooperative perception","author":"Yang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b124","doi-asserted-by":"crossref","unstructured":"W. Zimmer, G.A. Wardana, S. Sritharan, X. Zhou, R. Song, A.C. Knoll, Tumtraf v2x cooperative perception dataset, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22668\u201322677.","DOI":"10.1109\/CVPR52733.2024.02139"},{"key":"10.1016\/j.imavis.2026.105944_b125","series-title":"2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"10732","article-title":"OTVIC: A dataset with online transmission for vehicle-to-infrastructure cooperative 3D object detection","author":"Zhu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b126","article-title":"Dair-v2xreid: A new real-world vehicle-infrastructure cooperative re-id dataset and cross-shot feature aggregation network perception method","author":"Wang","year":"2024","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b127","doi-asserted-by":"crossref","unstructured":"C. Ma, L. Qiao, C. Zhu, K. Liu, Z. Kong, Q. Li, X. Zhou, Y. Kan, W. Wu, HoloVIC: Large-scale dataset and benchmark for multi-sensor holographic intersection and vehicle-infrastructure cooperative, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22129\u201322138.","DOI":"10.1109\/CVPR52733.2024.02089"},{"key":"10.1016\/j.imavis.2026.105944_b128","doi-asserted-by":"crossref","unstructured":"H. Yu, W. Yang, H. Ruan, Z. Yang, Y. Tang, X. Gao, X. Hao, Y. Shi, Y. Pan, N. Sun, et al., V2x-seq: A large-scale sequential dataset for vehicle-infrastructure cooperative perception and forecasting, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 5486\u20135495.","DOI":"10.1109\/CVPR52729.2023.00531"},{"key":"10.1016\/j.imavis.2026.105944_b129","series-title":"2022 IEEE 25th International Conference on Intelligent Transportation Systems","first-page":"1743","article-title":"Pillargrid: Deep learning-based cooperative perception for 3d object detection from onboard-roadside lidar","author":"Bai","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b130","doi-asserted-by":"crossref","unstructured":"H. Yu, Y. Luo, M. Shu, Y. Huo, Z. Yang, Y. Shi, Z. Guo, H. Li, X. Hu, J. Yuan, et al., Dair-v2x: A large-scale dataset for vehicle-infrastructure cooperative 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 21361\u201321370.","DOI":"10.1109\/CVPR52688.2022.02067"},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b131","doi-asserted-by":"crossref","first-page":"1852","DOI":"10.1109\/TITS.2020.3028424","article-title":"Cooperative perception for 3D object detection in driving scenarios using infrastructure sensors","volume":"23","author":"Arnold","year":"2020","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b132","series-title":"Mixed signals: A diverse point cloud dataset for heterogeneous lidar V2X collaboration","author":"Luo","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b133","series-title":"V2X-ReaLO: An open online framework and dataset for cooperative perception in reality","author":"Xiang","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b134","doi-asserted-by":"crossref","unstructured":"T. Wang, S. Kim, J. Wenxuan, E. Xie, C. Ge, J. Chen, Z. Li, P. Luo, Deepaccident: A motion and accident prediction benchmark for v2x autonomous driving, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, 2024, pp. 5599\u20135606.","DOI":"10.1609\/aaai.v38i6.28370"},{"key":"10.1016\/j.imavis.2026.105944_b135","series-title":"Adver-city: Open-source multi-modal dataset for collaborative perception under adverse weather conditions","author":"Karvat","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b136","series-title":"Multi-V2X: A large scale multi-modal multi-penetration-rate dataset for cooperative perception","author":"Li","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b137","series-title":"WHALES: A multi-agent scheduling dataset for enhanced cooperation in autonomous driving","author":"Chen","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b138","series-title":"V2x-r: Cooperative lidar-4d radar fusion for 3d object detection with denoising diffusion","author":"Huang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b139","series-title":"V2xpnp: Vehicle-to-everything spatio-temporal fusion for multi-agent perception and prediction","author":"Zhou","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b140","series-title":"SCOPE: A synthetic multi-modal dataset for collective perception including physical-correct weather conditions","author":"Gamerdinger","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b141","first-page":"22350","article-title":"RCDN: Towards robust camera-insensitivity collaborative perception via dynamic feature-based 3D neural modeling","volume":"37","author":"Wang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"4","key":"10.1016\/j.imavis.2026.105944_b142","doi-asserted-by":"crossref","first-page":"10914","DOI":"10.1109\/LRA.2022.3192802","article-title":"V2X-sim: Multi-agent collaborative perception dataset and benchmark for autonomous driving","volume":"7","author":"Li","year":"2022","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.imavis.2026.105944_b143","series-title":"European Conference on Computer Vision","first-page":"107","article-title":"V2x-vit: Vehicle-to-everything cooperative perception with vision transformer","author":"Xu","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b144","doi-asserted-by":"crossref","unstructured":"R. Mao, J. Guo, Y. Jia, Y. Sun, S. Zhou, Z. Niu, Dolphins: Dataset for collaborative perception enabled harmonious and interconnected self-driving, in: Proceedings of the Asian Conference on Computer Vision, 2022, pp. 4361\u20134377.","DOI":"10.1007\/978-3-031-26348-4_29"},{"key":"10.1016\/j.imavis.2026.105944_b145","series-title":"European Conference on Computer Vision","first-page":"455","article-title":"V2x-real: A largs-scale dataset for vehicle-to-everything cooperative perception","author":"Xiang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b146","doi-asserted-by":"crossref","unstructured":"R. Hao, S. Fan, Y. Dai, Z. Zhang, C. Li, Y. Wang, H. Yu, W. Yang, J. Yuan, Z. Nie, Rcooper: A real-world large-scale dataset for roadside cooperative perception, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22347\u201322357.","DOI":"10.1109\/CVPR52733.2024.02109"},{"key":"10.1016\/j.imavis.2026.105944_b147","series-title":"InScope: A new real-world 3D infrastructure-side collaborative perception dataset for open traffic scenarios","author":"Zhang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b148","article-title":"Object detection for autonomous vehicles under adverse weather conditions","author":"Chen","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.imavis.2026.105944_b149","doi-asserted-by":"crossref","unstructured":"Z. Yu, B. Qiu, A.W. Khong, ViKIENet: Towards Efficient 3D Object Detection with Virtual Key Instance Enhanced Network, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 11844\u201311853.","DOI":"10.1109\/CVPR52734.2025.01106"},{"issue":"12","key":"10.1016\/j.imavis.2026.105944_b150","doi-asserted-by":"crossref","first-page":"5972","DOI":"10.3390\/app12125972","article-title":"AspectNet: Aspect-aware anchor-free detector for autonomous driving","volume":"12","author":"Liang","year":"2022","journal-title":"Appl. Sci."},{"key":"10.1016\/j.imavis.2026.105944_b151","doi-asserted-by":"crossref","unstructured":"J. Choi, D. Chun, H. Kim, H.-J. Lee, Gaussian yolov3: An accurate and fast object detector using localization uncertainty for autonomous driving, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 502\u2013511.","DOI":"10.1109\/ICCV.2019.00059"},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b152","doi-asserted-by":"crossref","first-page":"1010","DOI":"10.1109\/TITS.2018.2838132","article-title":"SINet: A scale-insensitive convolutional neural network for fast vehicle detection","volume":"20","author":"Hu","year":"2018","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b153","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2019.102827","article-title":"ASSD: Attentive single shot multibox detector","volume":"189","author":"Yi","year":"2019","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.imavis.2026.105944_b154","doi-asserted-by":"crossref","unstructured":"S. Zhang, L. Wen, X. Bian, Z. Lei, S.Z. Li, Single-shot refinement neural network for object detection, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 4203\u20134212.","DOI":"10.1109\/CVPR.2018.00442"},{"key":"10.1016\/j.imavis.2026.105944_b155","doi-asserted-by":"crossref","unstructured":"S. Liu, D. Huang, et al., Receptive field block net for accurate and fast object detection, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 385\u2013400.","DOI":"10.1007\/978-3-030-01252-6_24"},{"key":"10.1016\/j.imavis.2026.105944_b156","doi-asserted-by":"crossref","unstructured":"B. Wu, F. Iandola, P.H. Jin, K. Keutzer, Squeezedet: Unified, small, low power fully convolutional neural networks for real-time object detection for autonomous driving, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, 2017, pp. 129\u2013137.","DOI":"10.1109\/CVPRW.2017.60"},{"key":"10.1016\/j.imavis.2026.105944_b157","series-title":"European Conference on Computer Vision","first-page":"354","article-title":"A unified multi-scale deep convolutional neural network for fast object detection","author":"Cai","year":"2016"},{"key":"10.1016\/j.imavis.2026.105944_b158","doi-asserted-by":"crossref","unstructured":"F. Yang, W. Choi, Y. Lin, Exploit all the layers: Fast and accurate cnn object detector with scale dependent pooling and cascaded rejection classifiers, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 2129\u20132137.","DOI":"10.1109\/CVPR.2016.234"},{"key":"10.1016\/j.imavis.2026.105944_b159","doi-asserted-by":"crossref","unstructured":"G. Brazil, X. Liu, M3d-rpn: Monocular 3d region proposal network for object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 9287\u20139296.","DOI":"10.1109\/ICCV.2019.00938"},{"key":"10.1016\/j.imavis.2026.105944_b160","doi-asserted-by":"crossref","unstructured":"F. Pu, Y. Wang, J. Deng, W. Yang, MonoDGP: Monocular 3D Object Detection with Decoupled-Query and Geometry-Error Priors, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2025, pp. 6520\u20136530.","DOI":"10.1109\/CVPR52734.2025.00611"},{"key":"10.1016\/j.imavis.2026.105944_b161","doi-asserted-by":"crossref","unstructured":"H.-I. Liu, C. Wu, J.-H. Cheng, W. Chai, S.-Y. Wang, G. Liu, H. Latapie, J.-C. Wu, J.-N. Hwang, H.-H. Shuai, W.-H. Cheng, MonoTAKD: Teaching Assistant Knowledge Distillation for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2025, pp. 22266\u201322275.","DOI":"10.1109\/CVPR52734.2025.02074"},{"key":"10.1016\/j.imavis.2026.105944_b162","doi-asserted-by":"crossref","unstructured":"Y. Ranasinghe, D. Hegde, V.M. Patel, MonoDiff: Monocular 3D Object Detection and Pose Estimation with Diffusion Models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2024, pp. 10659\u201310670.","DOI":"10.1109\/CVPR52733.2024.01014"},{"key":"10.1016\/j.imavis.2026.105944_b163","doi-asserted-by":"crossref","unstructured":"Z. Wu, Y. Gan, Y. Wu, R. Wang, X. Wang, J. Pu, Fd3d: Exploiting foreground depth map for feature-supervised monocular 3d object detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, 2024, pp. 6189\u20136197.","DOI":"10.1609\/aaai.v38i6.28436"},{"key":"10.1016\/j.imavis.2026.105944_b164","doi-asserted-by":"crossref","unstructured":"L. Yan, P. Yan, S. Xiong, X. Xiang, Y. Tan, MonoCD: Monocular 3D Object Detection with Complementary Depths, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2024, pp. 10248\u201310257.","DOI":"10.1109\/CVPR52733.2024.00976"},{"key":"10.1016\/j.imavis.2026.105944_b165","article-title":"GUPNet++: Geometry uncertainty propagation network for monocular 3D object detection","author":"Lu","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b166","doi-asserted-by":"crossref","unstructured":"R. Zhang, H. Qiu, T. Wang, Z. Guo, Z. Cui, Y. Qiao, H. Li, P. Gao, MonoDETR: Depth-guided Transformer for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 9155\u20139166.","DOI":"10.1109\/ICCV51070.2023.00840"},{"key":"10.1016\/j.imavis.2026.105944_b167","doi-asserted-by":"crossref","first-page":"11703","DOI":"10.52202\/075280-0514","article-title":"Monouni: A unified vehicle and infrastructure-side monocular 3d object detection network with sufficient depth clues","volume":"36","author":"Jinrang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b168","series-title":"European Conference on Computer Vision","first-page":"664","article-title":"Deviant: Depth equivariant network for monocular 3d object detection","author":"Kumar","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b169","series-title":"Monopgc: Monocular 3d object detection with pixel geometry contexts","author":"Wu","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b170","doi-asserted-by":"crossref","unstructured":"Y. Zhou, H. Zhu, Q. Liu, S. Chang, M. Guo, MonoATT: Online Monocular 3D Object Detection With Adaptive Token Transformer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2023, pp. 17493\u201317503.","DOI":"10.1109\/CVPR52729.2023.01678"},{"key":"10.1016\/j.imavis.2026.105944_b171","series-title":"European Conference on Computer Vision","first-page":"718","article-title":"Densely constrained depth estimator for monocular 3d object detection","author":"Li","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b172","doi-asserted-by":"crossref","unstructured":"Z. Li, Z. Qu, Y. Zhou, J. Liu, H. Wang, L. Jiang, Diversity Matters: Fully Exploiting Depth Clues for Reliable Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2022, pp. 2791\u20132800.","DOI":"10.1109\/CVPR52688.2022.00281"},{"key":"10.1016\/j.imavis.2026.105944_b173","doi-asserted-by":"crossref","unstructured":"Z. Qin, X. Li, MonoGround: Detecting Monocular 3D Objects From the Ground, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2022, pp. 3793\u20133802.","DOI":"10.1109\/CVPR52688.2022.00377"},{"key":"10.1016\/j.imavis.2026.105944_b174","doi-asserted-by":"crossref","unstructured":"Q. Lian, P. Li, X. Chen, MonoJSG: Joint Semantic and Geometric Cost Volume for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2022, pp. 1070\u20131079.","DOI":"10.1109\/CVPR52688.2022.00114"},{"key":"10.1016\/j.imavis.2026.105944_b175","series-title":"European Conference on Computer Vision","first-page":"135","article-title":"Kinematic 3d object detection in monocular video","author":"Brazil","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b176","doi-asserted-by":"crossref","unstructured":"D. Rukhovich, A. Vorontsova, A. Konushin, ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV, 2022, pp. 2397\u20132406.","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"10.1016\/j.imavis.2026.105944_b177","doi-asserted-by":"crossref","unstructured":"J. Gu, B. Wu, L. Fan, J. Huang, S. Cao, Z. Xiang, X.-S. Hua, Homography Loss for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2022, pp. 1080\u20131089.","DOI":"10.1109\/CVPR52688.2022.00115"},{"key":"10.1016\/j.imavis.2026.105944_b178","series-title":"European Conference on Computer Vision","first-page":"87","article-title":"Cross-modality knowledge distillation network for monocular 3d object detection","author":"Hong","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b179","series-title":"European Conference on Computer Vision","first-page":"71","article-title":"Did-m3d: Decoupling instance depth for monocular 3d object detection","author":"Peng","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b180","doi-asserted-by":"crossref","unstructured":"Y. Lu, X. Ma, L. Yang, T. Zhang, Y. Liu, Q. Chu, J. Yan, W. Ouyang, Geometry Uncertainty Projection Network for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2021, pp. 3111\u20133121.","DOI":"10.1109\/ICCV48922.2021.00310"},{"key":"10.1016\/j.imavis.2026.105944_b181","doi-asserted-by":"crossref","unstructured":"X. Ma, Y. Zhang, D. Xu, D. Zhou, S. Yi, H. Li, W. Ouyang, Delving Into Localization Errors for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2021, pp. 4721\u20134730.","DOI":"10.1109\/CVPR46437.2021.00469"},{"key":"10.1016\/j.imavis.2026.105944_b182","doi-asserted-by":"crossref","unstructured":"X. Shi, Q. Ye, X. Chen, C. Chen, Z. Chen, T.-K. Kim, Geometry-Based Distance Decomposition for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2021, pp. 15172\u201315181.","DOI":"10.1109\/ICCV48922.2021.01489"},{"key":"10.1016\/j.imavis.2026.105944_b183","doi-asserted-by":"crossref","unstructured":"Y. Zhang, J. Lu, J. Zhou, Objects Are Different: Flexible Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2021, pp. 3289\u20133298.","DOI":"10.1109\/CVPR46437.2021.00330"},{"key":"10.1016\/j.imavis.2026.105944_b184","doi-asserted-by":"crossref","unstructured":"H. Chen, Y. Huang, W. Tian, Z. Gao, L. Xiong, MonoRUn: Monocular 3D Object Detection by Reconstruction and Uncertainty Propagation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2021, pp. 10379\u201310388.","DOI":"10.1109\/CVPR46437.2021.01024"},{"key":"10.1016\/j.imavis.2026.105944_b185","doi-asserted-by":"crossref","unstructured":"L. Wang, L. Du, X. Ye, Y. Fu, G. Guo, X. Xue, J. Feng, L. Zhang, Depth-Conditioned Dynamic Message Propagation for Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2021, pp. 454\u2013463.","DOI":"10.1109\/CVPR46437.2021.00052"},{"key":"10.1016\/j.imavis.2026.105944_b186","doi-asserted-by":"crossref","unstructured":"Y. Chen, L. Tai, K. Sun, M. Li, Monopair: Monocular 3d object detection using pairwise spatial relationships, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 12093\u201312102.","DOI":"10.1109\/CVPR42600.2020.01211"},{"key":"10.1016\/j.imavis.2026.105944_b187","series-title":"European Conference on Computer Vision","first-page":"644","article-title":"Rtm3d: Real-time monocular 3d detection from object keypoints for autonomous driving","author":"Li","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b188","series-title":"European Conference on Computer Vision","first-page":"91","article-title":"Distance-normalized unified representation for monocular 3d object detection","author":"Shi","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b189","series-title":"European Conference on Computer Vision","first-page":"767","article-title":"Towards generalization across depth for monocular 3d object detection","author":"Simonelli","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b190","series-title":"European Conference on Computer Vision","first-page":"17","article-title":"Monocular 3d object detection via feature domain adaptation","author":"Ye","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b191","doi-asserted-by":"crossref","unstructured":"A. Simonelli, S.R. Bulo, L. Porzi, M. L\u00f3pez-Antequera, P. Kontschieder, Disentangling monocular 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 1991\u20131999.","DOI":"10.1109\/ICCV.2019.00208"},{"key":"10.1016\/j.imavis.2026.105944_b192","series-title":"Conference on Robot Learning","first-page":"180","article-title":"Detr3d: 3d object detection from multi-view images via 3d-to-2d queries","author":"Wang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b193","doi-asserted-by":"crossref","unstructured":"Z. Xue, M. Guo, H. Fan, S. Zhang, Z. Zhang, CorrBEV: Multi-View 3D Object Detection by Correlation Learning with Multi-modal Prototypes, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2025, pp. 27413\u201327423.","DOI":"10.1109\/CVPR52734.2025.02553"},{"key":"10.1016\/j.imavis.2026.105944_b194","series-title":"Bridging perspectives: Foundation model guided BEV maps for 3D object detection and tracking","author":"K\u00e4ppeler","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b195","series-title":"RoPETR: Improving temporal camera-only 3D detection by integrating enhanced rotary position embedding","author":"Ji","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b196","series-title":"European Conference on Computer Vision","first-page":"376","article-title":"Henet: Hybrid encoding for end-to-end multi-task 3d perception from multi-view cameras","author":"Xia","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b197","article-title":"Bevformer: Learning bird\u2019s-eye-view representation from lidar-camera via spatiotemporal transformers","author":"Li","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b198","series-title":"European Conference on Computer Vision","first-page":"200","article-title":"Ray denoising: Depth-aware hard negative sampling for multi-view 3d object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b199","doi-asserted-by":"crossref","unstructured":"S. Wang, Y. Liu, T. Wang, Y. Li, X. Zhang, Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 3621\u20133631.","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"10.1016\/j.imavis.2026.105944_b200","doi-asserted-by":"crossref","unstructured":"Y. Liu, J. Yan, F. Jia, S. Li, A. Gao, T. Wang, X. Zhang, PETRv2: A Unified Framework for 3D Perception from Multi-Camera Images, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 3262\u20133272.","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"10.1016\/j.imavis.2026.105944_b201","doi-asserted-by":"crossref","unstructured":"Y. Li, Z. Ge, G. Yu, J. Yang, Z. Wang, Y. Shi, J. Sun, Z. Li, Bevdepth: Acquisition of reliable depth for multi-view 3d object detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, 2023, pp. 1477\u20131485.","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"10.1016\/j.imavis.2026.105944_b202","doi-asserted-by":"crossref","unstructured":"H. Liu, Y. Teng, T. Lu, H. Wang, L. Wang, SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera Videos, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 18580\u201318590.","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"10.1016\/j.imavis.2026.105944_b203","doi-asserted-by":"crossref","unstructured":"Z. Zong, D. Jiang, G. Song, Z. Xue, J. Su, H. Li, Y. Liu, Temporal Enhanced Training of Multi-view 3D Object Detector via Historical Object Prediction, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 3781\u20133790.","DOI":"10.1109\/ICCV51070.2023.00350"},{"key":"10.1016\/j.imavis.2026.105944_b204","doi-asserted-by":"crossref","unstructured":"Y. Jiang, L. Zhang, Z. Miao, X. Zhu, J. Gao, W. Hu, Y.-G. Jiang, Polarformer: Multi-camera 3d object detection with polar transformer, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, 2023, pp. 1042\u20131050.","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"10.1016\/j.imavis.2026.105944_b205","doi-asserted-by":"crossref","unstructured":"Y. Wang, Y. Chen, Z. Zhang, FrustumFormer: Adaptive Instance-Aware Resampling for Multi-View 3D Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2023, pp. 5096\u20135105.","DOI":"10.1109\/CVPR52729.2023.00493"},{"key":"10.1016\/j.imavis.2026.105944_b206","doi-asserted-by":"crossref","unstructured":"Z. Li, Z. Yu, W. Wang, A. Anandkumar, T. Lu, J.M. Alvarez, FB-BEV: BEV Representation from Forward-Backward View Transformations, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 6919\u20136928.","DOI":"10.1109\/ICCV51070.2023.00637"},{"key":"10.1016\/j.imavis.2026.105944_b207","doi-asserted-by":"crossref","unstructured":"Z. Wang, Z. Huang, J. Fu, N. Wang, S. Liu, Object as Query: Lifting Any 2D Object Detector to 3D Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 3791\u20133800.","DOI":"10.1109\/ICCV51070.2023.00351"},{"key":"10.1016\/j.imavis.2026.105944_b208","doi-asserted-by":"crossref","unstructured":"K. Xiong, S. Gong, X. Ye, X. Tan, J. Wan, E. Ding, J. Wang, X. Bai, CAPE: Camera View Position Embedding for Multi-View 3D Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2023, pp. 21570\u201321579.","DOI":"10.1109\/CVPR52729.2023.02066"},{"key":"10.1016\/j.imavis.2026.105944_b209","doi-asserted-by":"crossref","unstructured":"C. Shu, J. Deng, F. Yu, Y. Liu, 3DPPE: 3D Point Positional Encoding for Transformer-based Multi-Camera 3D Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2023, pp. 3580\u20133589.","DOI":"10.1109\/ICCV51070.2023.00331"},{"key":"10.1016\/j.imavis.2026.105944_b210","series-title":"European Conference on Computer Vision","first-page":"531","article-title":"Petr: Position embedding transformation for multi-view 3d object detection","author":"Liu","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b211","first-page":"18442","article-title":"Unifying voxel-based representation with transformer for 3d object detection","volume":"35","author":"Li","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b212","doi-asserted-by":"crossref","unstructured":"Y. Li, H. Bao, Z. Ge, J. Yang, J. Sun, Z. Li, Bevstereo: Enhancing depth estimation in multi-view 3d object detection with temporal stereo, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, 2023, pp. 1486\u20131494.","DOI":"10.1609\/aaai.v37i2.25234"},{"key":"10.1016\/j.imavis.2026.105944_b213","series-title":"Conference on Robot Learning","first-page":"1475","article-title":"Probabilistic and geometric depth: Detecting objects in perspective","author":"Wang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b214","doi-asserted-by":"crossref","unstructured":"Z. Chen, Z. Li, S. Zhang, L. Fang, Q. Jiang, F. Zhao, Graph-DETR3D: Rethinking overlapping regions for multi-view 3D object detection, in: Proceedings of the 30th ACM International Conference on Multimedia, 2022, pp. 5999\u20136008.","DOI":"10.1145\/3503161.3547859"},{"key":"10.1016\/j.imavis.2026.105944_b215","doi-asserted-by":"crossref","unstructured":"T. Wang, X. Zhu, J. Pang, D. Lin, FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops, 2021, pp. 913\u2013922.","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"10.1016\/j.imavis.2026.105944_b216","series-title":"2020 6th International Conference on Advanced Computing and Communication Systems","first-page":"687","article-title":"YOLO v3-tiny: Object detection and recognition using one stage improved model","author":"Adarsh","year":"2020"},{"issue":"1","key":"10.1016\/j.imavis.2026.105944_b217","doi-asserted-by":"crossref","first-page":"21061","DOI":"10.1038\/s41598-025-06831-y","article-title":"An algorithm for road target detection of autonomous vehicles based on improved YOLOv8","volume":"15","author":"Gao","year":"2025","journal-title":"Sci. Rep."},{"key":"10.1016\/j.imavis.2026.105944_b218","series-title":"YOLOv6: A single-stage object detection framework for industrial applications","author":"Li","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b219","doi-asserted-by":"crossref","unstructured":"C.-Y. Wang, A. Bochkovskiy, H.-Y.M. Liao, YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 7464\u20137475.","DOI":"10.1109\/CVPR52729.2023.00721"},{"issue":"3","key":"10.1016\/j.imavis.2026.105944_b220","doi-asserted-by":"crossref","first-page":"925","DOI":"10.3390\/pr13030925","article-title":"Improved vehicle object detection algorithm based on swin-YOLOv5s","volume":"13","author":"An","year":"2025","journal-title":"Processes"},{"key":"10.1016\/j.imavis.2026.105944_b221","series-title":"YOLO-vehicle-pro: A cloud-edge collaborative framework for object detection in autonomous driving under adverse weather conditions","author":"Li","year":"2024"},{"issue":"1","key":"10.1016\/j.imavis.2026.105944_b222","doi-asserted-by":"crossref","first-page":"1","DOI":"10.62762\/TETAI.2024.894227","article-title":"YOLOv8-lite: A lightweight object detection model for real-time autonomous driving systems","volume":"1","author":"Yang","year":"2024","journal-title":"ICCK Trans. Emerg. Top. Artif. Intell."},{"issue":"17","key":"10.1016\/j.imavis.2026.105944_b223","doi-asserted-by":"crossref","first-page":"7596","DOI":"10.3390\/s23177596","article-title":"Object detection based on lightweight YOLOX for autonomous driving","volume":"23","author":"He","year":"2023","journal-title":"Sensors"},{"issue":"1","key":"10.1016\/j.imavis.2026.105944_b224","doi-asserted-by":"crossref","first-page":"9711","DOI":"10.1038\/s41598-023-36868-w","article-title":"Fast and accurate object detector for autonomous driving based on improved YOLOv5","volume":"13","author":"Jia","year":"2023","journal-title":"Sci. Rep."},{"key":"10.1016\/j.imavis.2026.105944_b225","first-page":"107984","article-title":"Yolov10: Real-time end-to-end object detection","volume":"37","author":"Wang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b226","series-title":"European Conference on Computer Vision","first-page":"1","article-title":"Yolov9: Learning what you want to learn using programmable gradient information","author":"Wang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b227","doi-asserted-by":"crossref","DOI":"10.1016\/j.measurement.2024.115820","article-title":"KPTr: Key point transformer for LiDAR-based 3D object detection","volume":"242","author":"Cao","year":"2025","journal-title":"Measurement"},{"key":"10.1016\/j.imavis.2026.105944_b228","doi-asserted-by":"crossref","DOI":"10.1109\/TITS.2025.3568418","article-title":"Fade3D: Fast and deployable 3D object detection for autonomous driving","author":"Ye","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b229","doi-asserted-by":"crossref","unstructured":"Z. Ding, X. Zhang, Q. Jing, Y. Cheng, R. Feng, AS-Det: Active Sampling for Adaptive 3D Object Detection in Point Clouds, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 39, 2025, pp. 2762\u20132770.","DOI":"10.1609\/aaai.v39i3.32281"},{"key":"10.1016\/j.imavis.2026.105944_b230","first-page":"13601","article-title":"Lion: Linear group rnn for 3d object detection in point clouds","volume":"37","author":"Liu","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"5","key":"10.1016\/j.imavis.2026.105944_b231","doi-asserted-by":"crossref","first-page":"7126","DOI":"10.1109\/JSEN.2024.3350770","article-title":"TSSTDet: Transformation-based 3-D object detection via a spatial shape transformer","volume":"24","author":"Hoang","year":"2024","journal-title":"IEEE Sens. J."},{"key":"10.1016\/j.imavis.2026.105944_b232","doi-asserted-by":"crossref","unstructured":"X. Jin, K. Liu, C. Ma, R. Yang, F. Hui, W. Wu, Swiftpillars: High-efficiency pillar encoder for lidar-based 3d detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, 2024, pp. 2625\u20132633.","DOI":"10.1609\/aaai.v38i3.28040"},{"key":"10.1016\/j.imavis.2026.105944_b233","doi-asserted-by":"crossref","unstructured":"H. Yang, W. Wang, M. Chen, B. Lin, T. He, H. Chen, X. He, W. Ouyang, Pvt-ssd: Single-stage 3d object detector with point-voxel transformer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 13476\u201313487.","DOI":"10.1109\/CVPR52729.2023.01295"},{"key":"10.1016\/j.imavis.2026.105944_b234","doi-asserted-by":"crossref","unstructured":"H. Wang, C. Shi, S. Shi, M. Lei, S. Wang, D. He, B. Schiele, L. Wang, Dsvt: Dynamic sparse voxel transformer with rotated sets, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 13520\u201313529.","DOI":"10.1109\/CVPR52729.2023.01299"},{"key":"10.1016\/j.imavis.2026.105944_b235","doi-asserted-by":"crossref","unstructured":"T. Zhao, X. Ning, K. Hong, Z. Qiu, P. Lu, Y. Zhao, L. Zhang, L. Zhou, G. Dai, H. Yang, et al., Ada3d: Exploiting the spatial redundancy with adaptive inference for efficient 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 17728\u201317738.","DOI":"10.1109\/ICCV51070.2023.01625"},{"key":"10.1016\/j.imavis.2026.105944_b236","doi-asserted-by":"crossref","unstructured":"Y. Chen, J. Liu, X. Zhang, X. Qi, J. Jia, Voxelnext: Fully sparse voxelnet for 3d object detection and tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 21674\u201321683.","DOI":"10.1109\/CVPR52729.2023.02076"},{"key":"10.1016\/j.imavis.2026.105944_b237","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2023.3301896","article-title":"3-D HANet: A flexible 3-D heatmap auxiliary network for object detection","volume":"61","author":"Xia","year":"2023","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.imavis.2026.105944_b238","doi-asserted-by":"crossref","unstructured":"H. Wu, C. Wen, W. Li, X. Li, R. Yang, C. Wang, Transformation-equivariant 3d object detection for autonomous driving, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, 2023, pp. 2795\u20132802.","DOI":"10.1609\/aaai.v37i3.25380"},{"issue":"16","key":"10.1016\/j.imavis.2026.105944_b239","doi-asserted-by":"crossref","first-page":"18879","DOI":"10.1109\/JSEN.2023.3293515","article-title":"3ONet: 3-D detector for occluded object under obstructed conditions","volume":"23","author":"Hoang","year":"2023","journal-title":"IEEE Sens. J."},{"key":"10.1016\/j.imavis.2026.105944_b240","series-title":"European Conference on Computer Vision","first-page":"544","article-title":"Rethinking iou-based optimization for single-stage 3D object detection","author":"Sheng","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b241","doi-asserted-by":"crossref","unstructured":"Y. Zhang, Q. Hu, G. Xu, Y. Ma, J. Wan, Y. Guo, Not all points are equal: Learning highly efficient point-based detectors for 3d lidar point clouds, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 18953\u201318962.","DOI":"10.1109\/CVPR52688.2022.01838"},{"key":"10.1016\/j.imavis.2026.105944_b242","series-title":"European Conference on Computer Vision","first-page":"35","article-title":"Pillarnet: Real-time and high-performance pillar-based 3d object detection","author":"Shi","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b243","doi-asserted-by":"crossref","unstructured":"C. He, R. Li, S. Li, L. Zhang, Voxel set transformer: A set-to-set approach to 3d object detection from point clouds, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 8417\u20138427.","DOI":"10.1109\/CVPR52688.2022.00823"},{"key":"10.1016\/j.imavis.2026.105944_b244","doi-asserted-by":"crossref","unstructured":"Q. Xu, Y. Zhong, U. Neumann, Behind the curtain: Learning occluded shapes for 3d object detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, 2022, pp. 2893\u20132901.","DOI":"10.1609\/aaai.v36i3.20194"},{"key":"10.1016\/j.imavis.2026.105944_b245","doi-asserted-by":"crossref","unstructured":"J. Deng, S. Shi, P. Li, W. Zhou, Y. Zhang, H. Li, Voxel r-cnn: Towards high performance voxel-based 3d object detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, 2021, pp. 1201\u20131209.","DOI":"10.1609\/aaai.v35i2.16207"},{"key":"10.1016\/j.imavis.2026.105944_b246","doi-asserted-by":"crossref","unstructured":"W. Zheng, W. Tang, S. Chen, L. Jiang, C.-W. Fu, Cia-ssd: Confident iou-aware single-stage object detector from point cloud, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, 2021, pp. 3555\u20133562.","DOI":"10.1609\/aaai.v35i4.16470"},{"key":"10.1016\/j.imavis.2026.105944_b247","doi-asserted-by":"crossref","unstructured":"H. Sheng, S. Cai, Y. Liu, B. Deng, J. Huang, X.-S. Hua, M.-J. Zhao, Improving 3d object detection with channel-wise transformer, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2743\u20132752.","DOI":"10.1109\/ICCV48922.2021.00274"},{"key":"10.1016\/j.imavis.2026.105944_b248","doi-asserted-by":"crossref","unstructured":"Q. Xu, Y. Zhou, W. Wang, C.R. Qi, D. Anguelov, Spg: Unsupervised domain adaptation for 3d object detection via semantic point generation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 15446\u201315456.","DOI":"10.1109\/ICCV48922.2021.01516"},{"key":"10.1016\/j.imavis.2026.105944_b249","doi-asserted-by":"crossref","unstructured":"W. Zheng, W. Tang, L. Jiang, C.-W. Fu, SE-SSD: Self-ensembling single-stage object detector from point cloud, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 14494\u201314503.","DOI":"10.1109\/CVPR46437.2021.01426"},{"key":"10.1016\/j.imavis.2026.105944_b250","doi-asserted-by":"crossref","unstructured":"L. Fan, X. Xiong, F. Wang, N. Wang, Z. Zhang, Rangedet: In defense of range view for lidar-based 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2918\u20132927.","DOI":"10.1109\/ICCV48922.2021.00291"},{"key":"10.1016\/j.imavis.2026.105944_b251","doi-asserted-by":"crossref","unstructured":"C. He, H. Zeng, J. Huang, X.-S. Hua, L. Zhang, Structure aware single-stage 3d object detection from point cloud, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 11873\u201311882.","DOI":"10.1109\/CVPR42600.2020.01189"},{"issue":"8","key":"10.1016\/j.imavis.2026.105944_b252","first-page":"2647","article-title":"From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network","volume":"43","author":"Shi","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b253","doi-asserted-by":"crossref","unstructured":"W. Shi, R. Rajkumar, Point-gnn: Graph neural network for 3d object detection in a point cloud, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 1711\u20131719.","DOI":"10.1109\/CVPR42600.2020.00178"},{"key":"10.1016\/j.imavis.2026.105944_b254","doi-asserted-by":"crossref","unstructured":"Z. Yang, Y. Sun, S. Liu, J. Jia, 3dssd: Point-based 3d single stage object detector, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 11040\u201311048.","DOI":"10.1109\/CVPR42600.2020.01105"},{"key":"10.1016\/j.imavis.2026.105944_b255","doi-asserted-by":"crossref","unstructured":"Z. Liu, X. Zhao, T. Huang, R. Hu, Y. Zhou, X. Bai, Tanet: Robust 3d object detection from point clouds with triple attention, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, 2020, pp. 11677\u201311684.","DOI":"10.1609\/aaai.v34i07.6837"},{"key":"10.1016\/j.imavis.2026.105944_b256","series-title":"European Conference on Computer Vision","first-page":"685","article-title":"Searching efficient 3d architectures with sparse point-voxel convolution","author":"Tang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b257","doi-asserted-by":"crossref","unstructured":"S. Shi, C. Guo, L. Jiang, Z. Wang, J. Shi, X. Wang, H. Li, Pv-rcnn: Point-voxel feature set abstraction for 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10529\u201310538.","DOI":"10.1109\/CVPR42600.2020.01054"},{"key":"10.1016\/j.imavis.2026.105944_b258","doi-asserted-by":"crossref","unstructured":"S. Shi, X. Wang, H. Li, Pointrcnn: 3d object proposal generation and detection from point cloud, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 770\u2013779.","DOI":"10.1109\/CVPR.2019.00086"},{"key":"10.1016\/j.imavis.2026.105944_b259","doi-asserted-by":"crossref","unstructured":"Z. Yang, Y. Sun, S. Liu, X. Shen, J. Jia, Std: Sparse-to-dense 3d object detector for point cloud, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 1951\u20131960.","DOI":"10.1109\/ICCV.2019.00204"},{"issue":"10","key":"10.1016\/j.imavis.2026.105944_b260","doi-asserted-by":"crossref","first-page":"3337","DOI":"10.3390\/s18103337","article-title":"Second: Sparsely embedded convolutional detection","volume":"18","author":"Yan","year":"2018","journal-title":"Sensors"},{"key":"10.1016\/j.imavis.2026.105944_b261","doi-asserted-by":"crossref","unstructured":"Y. Zhou, O. Tuzel, Voxelnet: End-to-end learning for point cloud based 3d object detection, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 4490\u20134499.","DOI":"10.1109\/CVPR.2018.00472"},{"key":"10.1016\/j.imavis.2026.105944_b262","first-page":"81489","article-title":"Voxel mamba: Group-free state space models for point cloud based 3d object detection","volume":"37","author":"Zhang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b263","article-title":"Pillarnest: Embracing backbone scaling and pretraining for pillar-based 3d object detection","author":"Mao","year":"2024","journal-title":"IEEE Trans. Intell. Veh."},{"key":"10.1016\/j.imavis.2026.105944_b264","doi-asserted-by":"crossref","unstructured":"G. Zhang, J. Chen, G. Gao, J. Li, S. Liu, X. Hu, Safdnet: A simple and effective network for fully sparse 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14477\u201314486.","DOI":"10.1109\/CVPR52733.2024.01372"},{"key":"10.1016\/j.imavis.2026.105944_b265","article-title":"Fsd v2: Improving fully sparse 3d object detection with virtual voxels","author":"Fan","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b266","doi-asserted-by":"crossref","unstructured":"T. Lu, X. Ding, H. Liu, G. Wu, L. Wang, Link: Linear kernel for lidar-based 3d perception, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 1105\u20131115.","DOI":"10.1109\/CVPR52729.2023.00113"},{"key":"10.1016\/j.imavis.2026.105944_b267","first-page":"53076","article-title":"Hednet: A hierarchical encoder-decoder network for 3d object detection in point clouds","volume":"36","author":"Zhang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b268","doi-asserted-by":"crossref","unstructured":"Y. Chen, J. Liu, X. Zhang, X. Qi, J. Jia, Largekernel3d: Scaling up kernels in 3d sparse cnns, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 13488\u201313498.","DOI":"10.1109\/CVPR52729.2023.01296"},{"key":"10.1016\/j.imavis.2026.105944_b269","first-page":"1","article-title":"Fully sparse transformer 3-D detector for LiDAR point cloud","volume":"61","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.imavis.2026.105944_b270","first-page":"39876","article-title":"Uni3detr: Unified 3d detection transformer","volume":"36","author":"Wang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b271","first-page":"34899","article-title":"Fully convolutional one-stage 3d object detection on lidar range images","volume":"35","author":"Tian","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b272","doi-asserted-by":"crossref","unstructured":"Y. Hu, Z. Ding, R. Ge, W. Shao, L. Huang, K. Li, Q. Liu, Afdetv2: Rethinking the necessity of the second stage for object detection from point clouds, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, 2022, pp. 969\u2013979.","DOI":"10.1609\/aaai.v36i1.19980"},{"key":"10.1016\/j.imavis.2026.105944_b273","doi-asserted-by":"crossref","unstructured":"Y. Chen, Y. Li, X. Zhang, J. Sun, J. Jia, Focal sparse convolutional networks for 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5428\u20135437.","DOI":"10.1109\/CVPR52688.2022.00535"},{"key":"10.1016\/j.imavis.2026.105944_b274","doi-asserted-by":"crossref","unstructured":"X. Bai, Z. Hu, X. Zhu, Q. Huang, Y. Chen, H. Fu, C.-L. Tai, Transfusion: Robust lidar-camera fusion for 3d object detection with transformers, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 1090\u20131099.","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"10.1016\/j.imavis.2026.105944_b275","doi-asserted-by":"crossref","unstructured":"H. Fazlali, Y. Xu, Y. Ren, B. Liu, A versatile multi-view framework for lidar-based 3d object detection with guidance from panoptic segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 17192\u201317201.","DOI":"10.1109\/CVPR52688.2022.01668"},{"key":"10.1016\/j.imavis.2026.105944_b276","doi-asserted-by":"crossref","unstructured":"T. Yin, X. Zhou, P. Krahenbuhl, Center-based 3d object detection and tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 11784\u201311793.","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"10.1016\/j.imavis.2026.105944_b277","doi-asserted-by":"crossref","unstructured":"X. Pan, Z. Xia, S. Song, L.E. Li, G. Huang, 3d object detection with pointformer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 7463\u20137472.","DOI":"10.1109\/CVPR46437.2021.00738"},{"key":"10.1016\/j.imavis.2026.105944_b278","first-page":"21224","article-title":"Every view counts: Cross-view consistency in 3d object detection with hybrid-cylindrical-spherical voxelization","volume":"33","author":"Chen","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b279","doi-asserted-by":"crossref","unstructured":"J. Yin, J. Shen, C. Guan, D. Zhou, R. Yang, Lidar-based online 3d video object detection with graph-based message passing and spatiotemporal transformer attention, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 11495\u201311504.","DOI":"10.1109\/CVPR42600.2020.01151"},{"key":"10.1016\/j.imavis.2026.105944_b280","series-title":"Class-balanced grouping and sampling for point cloud 3d object detection","author":"Zhu","year":"2019"},{"key":"10.1016\/j.imavis.2026.105944_b281","doi-asserted-by":"crossref","unstructured":"A.H. Lang, S. Vora, H. Caesar, L. Zhou, J. Yang, O. Beijbom, Pointpillars: Fast encoders for object detection from point clouds, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 12697\u201312705.","DOI":"10.1109\/CVPR.2019.01298"},{"key":"10.1016\/j.imavis.2026.105944_b282","series-title":"ELMAR: Enhancing LiDAR detection with 4D radar motion awareness and cross-modal uncertainty","author":"Peng","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b283","series-title":"MoRAL: Motion-aware multi-frame 4D radar and LiDAR fusion for robust 3D object detection","author":"Peng","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b284","doi-asserted-by":"crossref","unstructured":"X. Huang, Z. Xu, H. Wu, J. Wang, Q. Xia, Y. Xia, J. Li, K. Gao, C. Wen, C. Wang, L4dr: Lidar-4dradar fusion for weather-robust 3d object detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 39, 2025, pp. 3806\u20133814.","DOI":"10.1609\/aaai.v39i4.32397"},{"key":"10.1016\/j.imavis.2026.105944_b285","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Mutualforce: Mutual-aware enhancement for 4d radar-lidar 3d object detection","author":"Peng","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b286","series-title":"2024 IEEE International Conference on Robotics and Automation","first-page":"6585","article-title":"Robust 3d object detection from lidar-radar point clouds via cross-modal feature augmentation","author":"Deng","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b287","series-title":"European Conference on Computer Vision","first-page":"181","article-title":"Rlnet: Adaptive fusion of 4d radar and lidar for 3d object detection","author":"Xu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b288","series-title":"2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"12247","article-title":"InterFusion: Interaction-based 4D radar and LiDAR fusion for 3D object detection","author":"Wang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b289","doi-asserted-by":"crossref","DOI":"10.1109\/ACCESS.2025.3558780","article-title":"CLS-3D: Content-wise lidar-camera fusion and slot reweighting transformer for 3D object detection in autonomous vehicles","author":"Mushtaq","year":"2025","journal-title":"IEEE Access"},{"issue":"4","key":"10.1016\/j.imavis.2026.105944_b290","doi-asserted-by":"crossref","first-page":"1097","DOI":"10.3390\/s25041097","article-title":"Point-level fusion and channel attention for 3D object detection in autonomous driving","volume":"25","author":"Shen","year":"2025","journal-title":"Sensors"},{"key":"10.1016\/j.imavis.2026.105944_b291","article-title":"TinyFusionDet: Hardware-efficient lidar-camera fusion framework for 3D object detection at edge","author":"Li","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105944_b292","doi-asserted-by":"crossref","DOI":"10.1109\/TGRS.2024.3476072","article-title":"Channel-wise and spatially-guided multimodal feature fusion network for 3D object detection in autonomous vehicles","author":"Uzair","year":"2024","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.imavis.2026.105944_b293","first-page":"51562","article-title":"Unleash the potential of image branch for cross-modal 3d object detection","volume":"36","author":"Zhang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b294","doi-asserted-by":"crossref","unstructured":"X. Li, T. Ma, Y. Hou, B. Shi, Y. Yang, Y. Liu, X. Wu, Q. Chen, Y. Li, Y. Qiao, et al., Logonet: Towards accurate 3d object detection with local-to-global cross-modal fusion, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 17524\u201317534.","DOI":"10.1109\/CVPR52729.2023.01681"},{"key":"10.1016\/j.imavis.2026.105944_b295","doi-asserted-by":"crossref","unstructured":"H. Wu, C. Wen, S. Shi, X. Li, C. Wang, Virtual sparse convolution for multimodal 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 21653\u201321662.","DOI":"10.1109\/CVPR52729.2023.02074"},{"key":"10.1016\/j.imavis.2026.105944_b296","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2022.105815","article-title":"LiDAR-camera fusion: Dual transformer enhancement for 3D object detection","volume":"120","author":"Chen","year":"2023","journal-title":"Eng. Appl. Artif. Intell."},{"issue":"1","key":"10.1016\/j.imavis.2026.105944_b297","doi-asserted-by":"crossref","first-page":"845","DOI":"10.1109\/TII.2023.3263274","article-title":"MVMM: Multiview multimodal 3-D object detection for autonomous driving","volume":"20","author":"Li","year":"2023","journal-title":"IEEE Trans. Ind. Inform."},{"key":"10.1016\/j.imavis.2026.105944_b298","doi-asserted-by":"crossref","unstructured":"Y. Qin, C. Wang, Z. Kang, N. Ma, Z. Li, R. Zhang, SupFusion: Supervised LiDAR-camera fusion for 3D object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 22014\u201322024.","DOI":"10.1109\/ICCV51070.2023.02012"},{"key":"10.1016\/j.imavis.2026.105944_b299","doi-asserted-by":"crossref","first-page":"5291","DOI":"10.1109\/TMM.2022.3189778","article-title":"VPFNet: Improving 3D object detection with virtual point based lidar and stereo data fusion","volume":"25","author":"Zhu","year":"2022","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.imavis.2026.105944_b300","doi-asserted-by":"crossref","unstructured":"X. Wu, L. Peng, H. Yang, L. Xie, C. Huang, C. Deng, H. Liu, D. Cai, Sparse fuse dense: Towards high quality 3d detection with depth completion, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5418\u20135427.","DOI":"10.1109\/CVPR52688.2022.00534"},{"key":"10.1016\/j.imavis.2026.105944_b301","doi-asserted-by":"crossref","unstructured":"Y. Li, X. Qi, Y. Chen, L. Wang, Z. Li, J. Sun, J. Jia, Voxel field fusion for 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 1120\u20131129.","DOI":"10.1109\/CVPR52688.2022.00119"},{"key":"10.1016\/j.imavis.2026.105944_b302","series-title":"European Conference on Computer Vision","first-page":"662","article-title":"Graph r-cnn: Towards accurate 3d object detection with semantic-decorated local graph","author":"Yang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b303","doi-asserted-by":"crossref","unstructured":"Y. Zhang, J. Chen, D. Huang, Cat-det: Contrastively augmented transformer for multi-modal 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 908\u2013917.","DOI":"10.1109\/CVPR52688.2022.00098"},{"key":"10.1016\/j.imavis.2026.105944_b304","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1016\/j.neucom.2022.04.075","article-title":"MSL3D: 3D object detection from monocular, stereo and point cloud for autonomous driving","volume":"494","author":"Chen","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.imavis.2026.105944_b305","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2021.103295","article-title":"Deep structural information fusion for 3D object detection on LiDAR\u2013camera system","volume":"214","author":"An","year":"2022","journal-title":"Comput. Vis. Image Underst."},{"issue":"7","key":"10.1016\/j.imavis.2026.105944_b306","first-page":"8324","article-title":"EPNet++: Cascade bi-directional fusion for multi-modal 3D object detection","volume":"45","author":"Liu","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105944_b307","series-title":"2020 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"10386","article-title":"CLOCs: Camera-lidar object candidates fusion for 3D object detection","author":"Pang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b308","series-title":"European Conference on Computer Vision","first-page":"35","article-title":"Epnet: Enhancing point features with image semantics for 3d object detection","author":"Huang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b309","doi-asserted-by":"crossref","unstructured":"S. Vora, A.H. Lang, B. Helou, O. Beijbom, Pointpainting: Sequential fusion for 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 4604\u20134612.","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"10.1016\/j.imavis.2026.105944_b310","series-title":"2018 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"1","article-title":"Joint 3d proposal generation and object detection from view aggregation","author":"Ku","year":"2018"},{"key":"10.1016\/j.imavis.2026.105944_b311","doi-asserted-by":"crossref","unstructured":"D. Xu, D. Anguelov, A. Jain, Pointfusion: Deep sensor fusion for 3d bounding box estimation, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 244\u2013253.","DOI":"10.1109\/CVPR.2018.00033"},{"key":"10.1016\/j.imavis.2026.105944_b312","doi-asserted-by":"crossref","unstructured":"C.R. Qi, W. Liu, C. Wu, H. Su, L.J. Guibas, Frustum pointnets for 3d object detection from rgb-d data, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 918\u2013927.","DOI":"10.1109\/CVPR.2018.00102"},{"key":"10.1016\/j.imavis.2026.105944_b313","doi-asserted-by":"crossref","unstructured":"X. Chen, H. Ma, J. Wan, B. Li, T. Xia, Multi-view 3d object detection network for autonomous driving, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 1907\u20131915.","DOI":"10.1109\/CVPR.2017.691"},{"key":"10.1016\/j.imavis.2026.105944_b314","series-title":"Mv2dfusion: Leveraging modality-specific object semantics for multi-modal 3d detection","author":"Wang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b315","article-title":"Multi-modal BEV enhancement fusion for 3D object detection in autonomous driving","author":"Chen","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b316","series-title":"European Conference on Computer Vision","first-page":"109","article-title":"SparseLIF: High-performance sparse LiDAR-camera fusion for 3D object detection","author":"Zhang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b317","series-title":"Simplebev: Improved lidar-camera fusion architecture for 3d object detection","author":"Zhao","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b318","doi-asserted-by":"crossref","unstructured":"J. Yin, J. Shen, R. Chen, W. Li, R. Yang, P. Frossard, W. Wang, Is-fusion: Instance-scene collaborative fusion for multimodal 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14905\u201314915.","DOI":"10.1109\/CVPR52733.2024.01412"},{"key":"10.1016\/j.imavis.2026.105944_b319","series-title":"European Conference on Computer Vision","first-page":"439","article-title":"Detecting as labeling: Rethinking lidar-camera fusion in 3d object detection","author":"Huang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b320","series-title":"Bevfusion: Multi-task multi-sensor fusion with unified bird\u2019s-eye view representation","author":"Liu","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b321","doi-asserted-by":"crossref","unstructured":"Y. Xie, C. Xu, M.-J. Rakotosaona, P. Rim, F. Tombari, K. Keutzer, M. Tomizuka, W. Zhan, Sparsefusion: Fusing multi-modal sparse representations for multi-sensor 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 17591\u201317602.","DOI":"10.1109\/ICCV51070.2023.01613"},{"key":"10.1016\/j.imavis.2026.105944_b322","doi-asserted-by":"crossref","unstructured":"J. Yan, Y. Liu, J. Sun, F. Jia, S. Li, T. Wang, X. Zhang, Cross modal transformer: Towards fast and robust 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 18268\u201318278.","DOI":"10.1109\/ICCV51070.2023.01675"},{"key":"10.1016\/j.imavis.2026.105944_b323","doi-asserted-by":"crossref","unstructured":"X. Chen, T. Zhang, Y. Wang, Y. Wang, H. Zhao, Futr3d: A unified sensor fusion framework for 3d detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 172\u2013181.","DOI":"10.1109\/CVPRW59228.2023.00022"},{"key":"10.1016\/j.imavis.2026.105944_b324","doi-asserted-by":"crossref","unstructured":"Y. Chen, Z. Yu, Y. Chen, S. Lan, A. Anandkumar, J. Jia, J.M. Alvarez, Focalformer3d: Focusing on hard instance for 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 8394\u20138405.","DOI":"10.1109\/ICCV51070.2023.00771"},{"key":"10.1016\/j.imavis.2026.105944_b325","series-title":"FusionFormer: A multi-sensory fusion in bird\u2019s-eye-view and temporal consistent transformer for 3D object detection","author":"Hu","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b326","series-title":"Ea-lss: Edge-aware lift-splat-shot framework for 3d bev object detection","author":"Hu","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b327","series-title":"2021 IEEE International Intelligent Transportation Systems Conference","first-page":"3047","article-title":"Fusionpainting: Multimodal fusion with adaptive attention for 3d object detection","author":"Xu","year":"2021"},{"key":"10.1016\/j.imavis.2026.105944_b328","first-page":"16494","article-title":"Multimodal virtual point 3d detection","volume":"34","author":"Yin","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105944_b329","doi-asserted-by":"crossref","unstructured":"C. Wang, C. Ma, M. Zhu, X. Yang, Pointaugmenting: Cross-modal augmentation for 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 11794\u201311803.","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"10.1016\/j.imavis.2026.105944_b330","series-title":"European Conference on Computer Vision","first-page":"720","article-title":"3D-cvf: Generating joint camera and lidar features using cross-view spatial feature fusion for 3d object detection","author":"Yoo","year":"2020"},{"key":"10.1016\/j.imavis.2026.105944_b331","series-title":"Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities","author":"Comanici","year":"2025"},{"issue":"13","key":"10.1016\/j.imavis.2026.105944_b332","doi-asserted-by":"crossref","first-page":"2657","DOI":"10.3390\/electronics13132657","article-title":"Enhancing software code vulnerability detection using gpt-4o and claude-3.5 sonnet: A study on prompt engineering techniques","volume":"13","author":"Bae","year":"2024","journal-title":"Electronics"},{"key":"10.1016\/j.imavis.2026.105944_b333","series-title":"Grok 4","author":"xAI","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b334","series-title":"Qwen2. 5-coder technical report","author":"Hui","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b335","series-title":"GPT-o3","author":"OpenAI","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b336","series-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b337","series-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b338","series-title":"Claude 3","author":"Anthropic","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b339","series-title":"Deepseek-vl: Towards real-world vision-language understanding","author":"Lu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b340","series-title":"Gpt-4o system card","author":"Hurst","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b341","series-title":"Phi-4 technical report","author":"Abdin","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b342","series-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b343","series-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b344","series-title":"Palm 2 technical report","author":"Anil","year":"2023"},{"key":"10.1016\/j.imavis.2026.105944_b345","series-title":"TREC","first-page":"479","article-title":"FALCON: Boosting knowledge for answer engines","volume":"vol. 9","author":"Harabagiu","year":"2000"},{"key":"10.1016\/j.imavis.2026.105944_b346","series-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"10.1016\/j.imavis.2026.105944_b347","series-title":"BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"Lewis","year":"2019"},{"key":"10.1016\/j.imavis.2026.105944_b348","doi-asserted-by":"crossref","unstructured":"J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, Bert: Pre-training of deep bidirectional transformers for language understanding, in: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), 2019, pp. 4171\u20134186.","DOI":"10.18653\/v1\/N19-1423"},{"key":"10.1016\/j.imavis.2026.105944_b349","doi-asserted-by":"crossref","unstructured":"S. Ma, D. Qian, K. Ye, S. Zhang, Cake: Category aware knowledge extraction for open-vocabulary object detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 39, 2025, pp. 5982\u20135990.","DOI":"10.1609\/aaai.v39i6.32639"},{"key":"10.1016\/j.imavis.2026.105944_b350","series-title":"Led: Llm enhanced open-vocabulary object detection without human curated data generation","author":"Zhou","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b351","doi-asserted-by":"crossref","unstructured":"T. Cheng, L. Song, Y. Ge, W. Liu, X. Wang, Y. Shan, Yolo-world: Real-time open-vocabulary object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 16901\u201316911.","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"10.1016\/j.imavis.2026.105944_b352","series-title":"European Conference on Computer Vision","first-page":"38","article-title":"Grounding dino: Marrying dino with grounded pre-training for open-set object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b353","doi-asserted-by":"crossref","unstructured":"X. Wu, F. Zhu, R. Zhao, H. Li, Cora: Adapting clip for open-vocabulary detection with region prompting and anchor pre-matching, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 7031\u20137040.","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"10.1016\/j.imavis.2026.105944_b354","unstructured":"L.H. Li, P. Zhang, H. Zhang, J. Yang, C. Li, Y. Zhong, L. Wang, L. Yuan, L. Zhang, J.-N. Hwang, et al., Grounded language-image pre-training, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 10965\u201310975."},{"key":"10.1016\/j.imavis.2026.105944_b355","series-title":"F-vlm: Open-vocabulary object detection upon frozen vision and language models","author":"Kuo","year":"2022"},{"key":"10.1016\/j.imavis.2026.105944_b356","doi-asserted-by":"crossref","unstructured":"A. Kamath, M. Singh, Y. LeCun, G. Synnaeve, I. Misra, N. Carion, Mdetr-modulated detection for end-to-end multi-modal understanding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 1780\u20131790.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"10.1016\/j.imavis.2026.105944_b357","doi-asserted-by":"crossref","unstructured":"X. Chen, L. Huang, T. Ma, R. Fang, S. Shi, H. Li, SOLVE: Synergy of Language-Vision and End-to-End Networks for Autonomous Driving, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 12068\u201312077.","DOI":"10.1109\/CVPR52734.2025.01127"},{"key":"10.1016\/j.imavis.2026.105944_b358","doi-asserted-by":"crossref","unstructured":"W. Han, D. Guo, C.-Z. Xu, J. Shen, Dme-driver: Integrating human decision logic and 3d scene perception in autonomous driving, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 39, 2025, pp. 3347\u20133355.","DOI":"10.1609\/aaai.v39i3.32346"},{"key":"10.1016\/j.imavis.2026.105944_b359","series-title":"Opendrivevla: Towards end-to-end autonomous driving with large vision language action model","author":"Zhou","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b360","series-title":"Fastdrivevla: Efficient end-to-end driving via plug-and-play reconstruction-based token pruning","author":"Cao","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b361","doi-asserted-by":"crossref","unstructured":"Y. Xie, R. Xu, T. He, J.-J. Hwang, K. Luo, J. Ji, H. Lin, L. Chen, Y. Lu, Z. Leng, et al., S4-Driver: Scalable Self-Supervised Driving Multimodal Large Language Model with Spatio-Temporal Visual Representation, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 1622\u20131632.","DOI":"10.1109\/CVPR52734.2025.00159"},{"key":"10.1016\/j.imavis.2026.105944_b362","doi-asserted-by":"crossref","unstructured":"D. Hegde, R. Yasarla, H. Cai, S. Han, A. Bhattacharyya, S. Mahajan, L. Liu, R. Garrepalli, V.M. Patel, F. Porikli, Distilling multi-modal large language models for autonomous driving, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 27575\u201327585.","DOI":"10.1109\/CVPR52734.2025.02568"},{"key":"10.1016\/j.imavis.2026.105944_b363","doi-asserted-by":"crossref","unstructured":"C. Pan, B. Yaman, T. Nesti, A. Mallik, A.G. Allievi, S. Velipasalar, L. Ren, Vlp: Vision language planning for autonomous driving, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14760\u201314769.","DOI":"10.1109\/CVPR52733.2024.01398"},{"key":"10.1016\/j.imavis.2026.105944_b364","series-title":"Drivevlm: The convergence of autonomous driving and large vision-language models","author":"Tian","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b365","series-title":"Emma: End-to-end multimodal model for autonomous driving","author":"Hwang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105944_b366","series-title":"AutoVLA: A vision-language-action model for end-to-end autonomous driving with adaptive reasoning and reinforcement fine-tuning","author":"Zhou","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b367","series-title":"Orion: A holistic end-to-end autonomous driving framework by vision-language instructed action generation","author":"Fu","year":"2025"},{"key":"10.1016\/j.imavis.2026.105944_b368","doi-asserted-by":"crossref","unstructured":"K. Renz, L. Chen, E. Arani, O. Sinavski, Simlingo: Vision-only closed-loop autonomous driving with language-action alignment, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 11993\u201312003.","DOI":"10.1109\/CVPR52734.2025.01120"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000508?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000508?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T07:02:30Z","timestamp":1777532550000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885626000508"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":368,"alternative-id":["S0262885626000508"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105944","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"All you need for object detection: From pixels, points, and prompts to Next-Gen fusion and multimodal LLMs\/VLMs in autonomous vehicles","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105944","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Authors. Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"105944"}}