{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T15:35:26Z","timestamp":1780328126267,"version":"3.54.1"},"reference-count":187,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2022ZD0160104"],"award-info":[{"award-number":["2022ZD0160104"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206172"],"award-info":[{"award-number":["62206172"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1109\/tpami.2023.3333838","type":"journal-article","created":{"date-parts":[[2023,11,17]],"date-time":"2023-11-17T19:05:14Z","timestamp":1700247914000},"page":"2151-2170","source":"Crossref","is-referenced-by-count":121,"title":["Delving Into the Devils of Bird\u2019s-Eye-View Perception: A Review, Evaluation and Recipe"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9110-5534","authenticated-orcid":false,"given":"Hongyang","family":"Li","sequence":"first","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6350-4116","authenticated-orcid":false,"given":"Chonghao","family":"Sima","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6785-0785","authenticated-orcid":false,"given":"Jifeng","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2418-3134","authenticated-orcid":false,"given":"Wenhai","family":"Wang","sequence":"additional","affiliation":[{"name":"Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9809-3818","authenticated-orcid":false,"given":"Lewei","family":"Lu","sequence":"additional","affiliation":[{"name":"SenseTime, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3960-084X","authenticated-orcid":false,"given":"Huijie","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0682-4898","authenticated-orcid":false,"given":"Jia","family":"Zeng","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7041-0748","authenticated-orcid":false,"given":"Zhiqi","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9106-369X","authenticated-orcid":false,"given":"Jiazhi","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6255-5479","authenticated-orcid":false,"given":"Hanming","family":"Deng","sequence":"additional","affiliation":[{"name":"SenseTime, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0941-3629","authenticated-orcid":false,"given":"Hao","family":"Tian","sequence":"additional","affiliation":[{"name":"SenseTime, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6890-1049","authenticated-orcid":false,"given":"Enze","family":"Xie","sequence":"additional","affiliation":[{"name":"Huawei Inc., Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2216-9145","authenticated-orcid":false,"given":"Jiangwei","family":"Xie","sequence":"additional","affiliation":[{"name":"SenseTime, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9500-5722","authenticated-orcid":false,"given":"Li","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3838-160X","authenticated-orcid":false,"given":"Tianyu","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2747-6960","authenticated-orcid":false,"given":"Yang","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-1288","authenticated-orcid":false,"given":"Yulu","family":"Gao","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5222-1476","authenticated-orcid":false,"given":"Xiaosong","family":"Jia","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9180-2935","authenticated-orcid":false,"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3257-8272","authenticated-orcid":false,"given":"Jianping","family":"Shi","sequence":"additional","affiliation":[{"name":"SenseTime, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8865-7896","authenticated-orcid":false,"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1889-2567","authenticated-orcid":false,"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref3","article-title":"M $^{2}$2 BEV: Multi-camera joint 3D detection and segmentation with unified birds-eye view representation","author":"Xie","year":"2022"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/icra48891.2023.10160968"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00280"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_23"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref12","article-title":"Argoverse 2: Next generation datasets for self-driving perception and forecasting","volume-title":"Proc. Neural Inf. Process. Syst. Track Datasets Benchmarks","volume":"1","author":"Wilson"},{"key":"ref13","article-title":"Drago anguelov \u2013 Machine learning for autonomous driving at scale","year":"2020"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref15","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref18","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2019.2892405"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-021-11137-y"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108796"},{"key":"ref22","article-title":"Vision-centric BEV perception: A survey","author":"Ma","year":"2022"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-06780-8_4"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00895"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2926463"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_32"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01663"},{"key":"ref28","first-page":"409","article-title":"One thousand and one hours: Self-driving motion prediction dataset","volume-title":"Proc. Conf. Robot. Learn.","author":"Houston"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197385"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793925"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00939"},{"key":"ref32","article-title":"A2d2: Audi autonomous driving dataset","author":"Geyer","year":"2020"},{"key":"ref33","article-title":"Cityscapes 3D: Dataset and benchmark for 9 DoF vehicle detection","author":"G\u00e4hlert","year":"2020"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC48978.2021.9565009"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2022.3179507"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561267"},{"key":"ref37","article-title":"One million scenes for autonomous driving: Once dataset","author":"Mao","year":"2021"},{"key":"ref38","article-title":"All-in-one drive: A large-scale comprehensive perception dataset with high-density long-range point clouds","author":"Weng","year":"2021"},{"key":"ref39","article-title":"DeepAccident: A large-scale accident dataset for multi-vehicle autonomous driving","author":"Wang","year":"2022"},{"key":"ref40","article-title":"CARLA: An open urban driving simulator","author":"Dosovitskiy","year":"2017"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref43","article-title":"Orthographic feature transform for monocular 3D object detection","volume-title":"Proc. Brit. Mach. Vis. Conf.","author":"Roddick"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00472"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01298"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2103.01100"},{"key":"ref47","article-title":"BEVDet: High-performance multi-camera 3D object detection in bird-eye-view","author":"Huang","year":"2021"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC45102.2020.9294462"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01499"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01339"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812383"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811901"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_31"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00301"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01537"},{"key":"ref61","article-title":"Let-3D-ap: Longitudinal error tolerant 3d average precision for camera-only 3D detection","author":"Hung","year":"2022"},{"key":"ref62","article-title":"Probabilistic and geometric depth: Detecting objects in perspective","author":"Wang","year":"2022"},{"key":"ref63","article-title":"BEVerse: Unified perception and prediction in birds-eye-view for vision-centric autonomous driving","author":"Zhang","year":"2022"},{"key":"ref64","article-title":"BEVDet4D: Exploit temporal cues in multi-camera 3D object detection","author":"Huang","year":"2022"},{"key":"ref65","first-page":"12533","article-title":"DSGN: Deep stereo geometry network for 3D object detection","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Yilun"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01054"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00827"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19980"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01710-9"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00272"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_39"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"ref74","first-page":"16494","article-title":"Multimodal virtual point 3d detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Yin"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/116"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_36"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00502"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00506"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_41"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.3390\/s18103337"},{"key":"ref85","article-title":"Detr3D: 3D object detection from multi-view images via 3D-to-2D queries","author":"Wang","year":"2022"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00864"},{"key":"ref87","article-title":"Pseudo-lidar++: Accurate depth for 3D object detection in autonomous driving","author":"You","year":"2019"},{"key":"ref88","article-title":"BEVFusion: A simple and robust LiDAR-camera fusion framework","author":"Liang","year":"2022"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00314"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/BF00201978"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1108\/k.2001.30.9_10.1333.2"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19769-7_23"},{"key":"ref93","article-title":"MV-FCOS3D++: Multi-View camera-only 4D object detection with pretrained monocular backbones","author":"Wang","year":"2022"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.3004325"},{"key":"ref95","article-title":"Fishing net: Future inference of semantic heatmaps in grids","author":"Hendy","year":"2020"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01550"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01528"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3142418"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00249"},{"key":"ref102","article-title":"BEV-Seg: Birds eye view semantic segmentation using geometry and semantic point cloud","author":"Ng","year":"2020"},{"key":"ref103","first-page":"77","article-title":"PointNet: Deep learning on point sets for 3D classification and segmentation","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Qi"},{"key":"ref104","first-page":"5105","article-title":"PointNet++: Deep hierarchical feature learning on point sets in a metric space","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Qi"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01189"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00315"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16207"},{"key":"ref108","first-page":"20745","article-title":"Object DGCNN: 3D object detection using dynamic graphs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Wang"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.691"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00798"},{"key":"ref111","article-title":"HDNET: Exploiting HD maps for 3D object detection","author":"Yang","year":"2018"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1016\/j.ecoinf.2021.101236"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2018.2852843"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-11015-4_54"},{"key":"ref115","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-030-11009-3_11","article-title":"Complex-YOLO: An euler-region-proposal for real-time 3D object detection on point clouds","volume-title":"Proc. Eur. Conf. Comput. Vis. Workshops","author":"Simony"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref117","article-title":"Spatially-sparse convolutional neural networks","author":"Graham","year":"2014"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00319"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00738"},{"key":"ref121","article-title":"Unifying voxel-based representation with transformer for 3D object detection","author":"Li","year":"2022"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00321"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00798"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.700"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00907"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"ref136","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","volume-title":"CoRR","volume":"abs\/2010.04159","author":"Zhu"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2953639"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00464"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.236"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.597"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00375"},{"key":"ref142","article-title":"Objects as points","author":"Zhou","year":"2019"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00938"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01214"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00217"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00086"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2977026"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_19"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00885"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00118"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01105"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_6"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1145\/3326362"},{"key":"ref155","first-page":"828","article-title":"PointCNN: Convolution on X-transformed points","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00651"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01112"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00962"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00981"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01236"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9562041"},{"key":"ref163","article-title":"AMVNet: Assertion-based multi-view fusion network for LiDAR semantic segmentation","author":"Liong","year":"2020"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00735"},{"key":"ref165","article-title":"Drinet++: Efficient voxel-as-point point cloud segmentation","author":"Ye","year":"2021"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01572"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/3DV53792.2021.00046"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_39"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794195"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00752"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_39"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00157"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw59228.2023.00022"},{"key":"ref174","article-title":"DeepInteraction: 3D object detection via modality interaction","author":"Yang","year":"2020"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00102"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8594049"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341791"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01407"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref181","first-page":"147","article-title":"Freeanchor: Learning to match anchors for visual object detection","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104117"},{"key":"ref185","article-title":"1st place solutions for OpenImage2019\u2013object detection and instance segmentation","author":"Liu","year":"2020"},{"key":"ref186","first-page":"17721","article-title":"SOLOv2: Dynamic and fast instance segmentation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Wang"},{"key":"ref187","article-title":"Neural network intelligence"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00747"},{"key":"ref189","article-title":"On the opportunities and risks of foundation models","author":"Bommasani","year":"2021"},{"key":"ref190","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown"},{"key":"ref191","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref192","article-title":"Uni-Perceiver-MoE: Learning sparse generalist models with conditional moes","author":"Zhu","year":"2022"},{"key":"ref193","article-title":"A generalist agent","author":"Reed","year":"2022"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10461350\/10321736.pdf?arnumber=10321736","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,8]],"date-time":"2024-03-08T18:49:48Z","timestamp":1709923788000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10321736\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4]]},"references-count":187,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3333838","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,4]]}}}