{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:38:06Z","timestamp":1774021086414,"version":"3.50.1"},"reference-count":76,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1109\/tpami.2025.3609348","type":"journal-article","created":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T17:29:57Z","timestamp":1757698197000},"page":"609-623","source":"Crossref","is-referenced-by-count":5,"title":["MV2DFusion: Leveraging Modality-Specific Object Semantics for Multi-Modal 3D Detection"],"prefix":"10.1109","volume":"48","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8138-1869","authenticated-orcid":false,"given":"Zitian","family":"Wang","sequence":"first","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1653-208X","authenticated-orcid":false,"given":"Zehao","family":"Huang","sequence":"additional","affiliation":[{"name":"Independent Researcher, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-1288","authenticated-orcid":false,"given":"Yulu","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0526-3331","authenticated-orcid":false,"given":"Naiyan","family":"Wang","sequence":"additional","affiliation":[{"name":"Independent Researcher, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9180-2935","authenticated-orcid":false,"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"BEVDet: High-performance multi-camera 3D object detection in bird-eye-view","author":"Huang","year":"2021"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"ref3","first-page":"180","article-title":"DETR3D: 3D object detection from multi-view images via 3D-to-2D queries","volume-title":"Proc. Conf. Robot Learn.","author":"Wang"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01054"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3390\/s18103337"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01298"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00472"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/116"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_36"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"ref13","first-page":"10421","article-title":"BEVfusion: A simple and robust LiDAR-camera fusion framework","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liang"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01675"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610230"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00102"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3392303"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01613"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00351"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref23","article-title":"Argoverse 2: Next generation datasets for self-driving perception and forecasting","author":"Wilson","year":"2023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00086"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01105"},{"key":"ref26","first-page":"351","article-title":"Fully sparse 3D object detection","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Fan"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3502456"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00291"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00567"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02076"},{"key":"ref33","article-title":"Objects as points","author":"Zhou","year":"2019"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00864"},{"key":"ref36","article-title":"Bevdet4D: Exploit temporal cues in multi-camera 3D object detection","author":"Huang","year":"2022"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00302"},{"key":"ref42","article-title":"Sparse4D: Multi-view 3D object detection with sparse spatial-temporal fusion","author":"Lin","year":"2022"},{"key":"ref43","article-title":"Sparse4D V2: Recurrent temporal fusion with sparse model","author":"Lin","year":"2023"},{"key":"ref44","article-title":"Sparse4D V3: Advancing end-to-end 3D detection and tracking","author":"Lin","year":"2023"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"ref47","first-page":"16494","article-title":"Multimodal virtual point 3D detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yin"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC48978.2021.9564951"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341620"},{"key":"ref50","article-title":"Sparsefusion: Efficient sparse multi-modal fusion framework for long-range 3D perception","author":"Li","year":"2024"},{"key":"ref51","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ren"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref53","article-title":"Yolox: Exceeding yolo series in 2021","author":"Ge","year":"2021"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20158"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"ref57","article-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Zhang","year":"2022"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref59","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1002\/nav.3800020109"},{"key":"ref61","first-page":"18442","article-title":"Unifying voxel-based representation with transformer for 3D object detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref62","first-page":"1992","article-title":"Deepinteraction: 3D object detection via modality interaction","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref63","article-title":"EA-LSS: Edge-aware lift-splat-shot framework for 3D bev object detection","author":"Hu","year":"2023"},{"key":"ref64","article-title":"Bevfusion4D: Learning LiDAR-camera fusion under bird\u2019s-eye-view via cross-modality guidance and temporal aggregation","author":"Cai","year":"2023"},{"key":"ref65","article-title":"Fusionformer: A multi-sensory fusion in bird\u2019s-eye-view and temporal consistent transformer for 3D objection","author":"Hu","year":"2023"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref67","article-title":"Decoupled weight decay regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"},{"key":"ref68","article-title":"SGDR: Stochastic gradient descent with warm restarts","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28033"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01372"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104117"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref76","article-title":"BEVpoolv2: A cutting-edge implementation of BEVdet toward deployment","author":"Huang","year":"2022"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11275622\/11160680.pdf?arnumber=11160680","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T09:07:23Z","timestamp":1764839243000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11160680\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":76,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3609348","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1]]}}}