{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T22:13:35Z","timestamp":1778278415263,"version":"3.51.4"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,5,29]],"date-time":"2023-05-29T00:00:00Z","timestamp":1685318400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,5,29]],"date-time":"2023-05-29T00:00:00Z","timestamp":1685318400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,5,29]]},"DOI":"10.1109\/icra48891.2023.10161451","type":"proceedings-article","created":{"date-parts":[[2023,7,4]],"date-time":"2023-07-04T13:20:56Z","timestamp":1688476856000},"page":"4850-4857","source":"Crossref","is-referenced-by-count":9,"title":["CrossDTR: Cross-view and Depth-guided Transformers for 3D Object Detection"],"prefix":"10.1109","author":[{"given":"Ching-Yu","family":"Tseng","sequence":"first","affiliation":[{"name":"National Taiwan University"}]},{"given":"Yi-Rong","family":"Chen","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]},{"given":"Hsin-Ying","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]},{"given":"Tsung-Han","family":"Wu","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]},{"given":"Wen-Chin","family":"Chen","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]},{"given":"Winston H.","family":"Hsu","sequence":"additional","affiliation":[{"name":"National Taiwan University"}]}],"member":"263","reference":[{"key":"ref13","first-page":"180","article-title":"Detr3d: 3d object detection from multi-view images via 3d-to-2d 
queries","author":"wang","year":"2022","journal-title":"Conference on Robot Learning"},{"key":"ref12","first-page":"1475","article-title":"Probabilistic and geometric depth: Detecting objects in perspective","author":"wang","year":"2022","journal-title":"Conference on Robot Learning"},{"key":"ref15","article-title":"Bevdet4d: Exploit temporal cues in multi-camera 3d object detection","author":"huang","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref14","article-title":"Bevdet: High-performance multi-camera 3d object detection in bird-eye-view","author":"huang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"ref17","article-title":"Petr: Position embedding transformation for multi-view 3d object detection","author":"liu","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref16","article-title":"Bevformer: Learning bird's-eye-view representation from multi-camera images via spatiotemporal transformers","author":"li","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547859"},{"key":"ref18","article-title":"Petrv2: A unified framework for 3d perception from multi-camera images","author":"liu","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref46","first-page":"289","article-title":"Center3d: Center-based monocular 3d object detection with joint depth understanding","author":"tang","year":"2020","journal-title":"DAGM German Conference on Pattern Recognition (GCPR)"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref48","article-title":"Class-balanced grouping and sampling for point cloud 3d object detection","author":"zhu","year":"2019","journal-title":"ArXiv 
Preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref42","article-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"zhu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref41","article-title":"Bevsegformer: Bird's eye view semantic segmentation from arbitrary camera rigs","author":"peng","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref44","article-title":"3d object detection for autonomous driving: A review and new outlooks","author":"mao","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref43","author":"chang","year":"2021","journal-title":"Transformer-based monocular depth estimation with attention supervision"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00114"},{"key":"ref7","article-title":"Objects as points","author":"zhou","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref9","article-title":"Pseudo-lidar++: Accurate depth for 3d object detection in autonomous driving","author":"you","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01255"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"ref6","article-title":"Orthographic feature transform for monocular 3d object detection","author":"roddick","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00783"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811901"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20158"},{"key":"ref36","first-page":"213","article-title":"End-to-end object 
detection with transformers","author":"carion","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref31","article-title":"Monodetr: Depth-aware transformer for monocular 3d object detection","author":"zhang","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref30","first-page":"13906","article-title":"Centermask: Real-time anchor-free instance segmentation","author":"lee","year":"2020","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_39"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00398"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01298"},{"key":"ref1","doi-asserted-by":"crossref","first-page":"3337","DOI":"10.3390\/s18103337","article-title":"Second: Sparsely embedded convolutional detection","volume":"18","author":"yan","year":"2018","journal-title":"SENSORS"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.3004325"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2891028"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01339"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01499"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01169"},{"key":"ref25","article-title":"Learning geometry-guided depth via projective modeling for monocular 3d object detection","author":"zhang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref20","article-title":"Beverse: Unified perception and prediction in birds-eye-view for vision-centric autonomous driving","author":"zhang","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref22","first-page":"194","article-title":"Lift, splat, shoot: Encoding images from arbitrary camera rigs by implicitly unprojecting to 3d","author":"philion","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00256"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00103"}],"event":{"name":"2023 IEEE International Conference on Robotics and Automation (ICRA)","location":"London, United Kingdom","start":{"date-parts":[[2023,5,29]]},"end":{"date-parts":[[2023,6,2]]}},"container-title":["2023 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10160211\/10160212\/10161451.pdf?arnumber=10161451","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T13:33:03Z","timestamp":1690205583000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10161451\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,29]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/icra48891.2023.10161451","relation":{},"subject":[],"published":{"date-parts":[[2023,5,29]]}}}