{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T03:29:25Z","timestamp":1775014165496,"version":"3.50.1"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726699","type":"print"},{"value":"9783031726705","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_25","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"439-455","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":27,"title":["Detecting as Labeling: Rethinking LiDAR-Camera Fusion in\u00a03D Object Detection"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2853-2549","authenticated-orcid":false,"given":"Junjie","family":"Huang","sequence":"first","affiliation":[]},{"given":"Yun","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Zhujin","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Shan","sequence":"additional","affiliation":[]},{"given":"Dalong","family":"Du","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"25_CR1","doi-asserted-by":"crossref","unstructured":"Bai, 
X., et al.: TransFusion: robust LiDAR-camera fusion for 3D object detection with transformers. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Cai, Q., Pan, Y., Yao, T., Ngo, C.W., Mei, T.: ObjectFusion: multi-modal 3D object detection with object-centric fusion. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01656"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Chen, X., Ma, H., Wan, J., Li, B., Xia, T.: Multi-view 3D object detection network for autonomous driving. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.691"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., Li, Y., Zhang, X., Sun, J., Jia, J.: Focal sparse convolutional networks for 3D object detection. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00535"},{"key":"25_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"25_CR8","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: UniTR: a unified and efficient multi-modal transformer for bird\u2019s-eye-view representation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00625"},{"key":"25_CR9","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: DSVT: dynamic sparse voxel transformer with rotated sets. 
In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01299"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep Residual Learning for Image Recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Planning-oriented autonomous driving. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"25_CR12","unstructured":"Huang, J., Huang, G.: BEVDet4D: Exploit Temporal Cues in Multi-camera 3D Object Detection (2022). arXiv preprint arXiv:2203.17054"},{"key":"25_CR13","unstructured":"Huang, J., Huang, G., Zhu, Z., Yun, Y., Du, D.: BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View (2021). arXiv preprint arXiv:2112.11790"},{"key":"25_CR14","unstructured":"Dai, J., et al.: Deformable Convolutional Networks (2017). arXiv preprint arXiv:1703.06211"},{"key":"25_CR15","unstructured":"Li, Y., Chen, Y., Qi, X., Li, Z., Sun, J., Jia, J.: Unifying Voxel-based Representation with Transformer for 3D Object Detection. In: NeurIPS (2022)"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Li, Y., Bao, H., Ge, Z., Yang, J., Sun, J., Li, Z.: BEVStereo: enhancing depth estimation in multi-view 3D object detection with dynamic temporal stereo. In: AAAI (2022)","DOI":"10.1609\/aaai.v37i2.25234"},{"key":"25_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: BEVDepth: acquisition of reliable depth for multi-view 3D object detection. In: AAAI (2022)","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Li, Z., Yu, Z., Wang, W., Anandkumar, A., Lu, T., \u00c1lvarez, J.M.: FB-BEV: BEV representation from forward-backward view transformations. 
In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00637"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: BEVFusion: multi-task multi-sensor fusion with unified bird\u2019s-eye view representation. In: ICRA (2023)","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"25_CR23","doi-asserted-by":"crossref","unstructured":"Pan, B., Sun, J., Leung, H.Y.T., Andonian, A., Zhou, B.: Cross-view semantic segmentation for sensing surroundings. IEEE Robot. Autom. Lett. 5(3), 4867\u20134873 (2020)","DOI":"10.1109\/LRA.2020.3004325"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Park, D., Ambrus, R., Guizilini, V., Li, J., Gaidon, A.: Is pseudo-lidar needed for monocular 3D object detection? In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"25_CR25","unstructured":"Park, J., et al.: Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection (2023)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Philion, J., Fidler, S.: Lift, Splat, Shoot: encoding images from arbitrary camera rigs by implicitly Unprojecting to 3D. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Reading, C., Harakeh, A., Chae, J., Waslander, S.L.: Categorical depth distribution network for monocular 3D object detection. 
In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Roddick, T., Cipolla, R.: Predicting semantic map representations from images using pyramid occupancy networks. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01115"},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"Sindagi, V.A., Zhou, Y., Tuzel, O.: MVX-Net: multimodal VoxelNet for 3D object detection. In: ICRA (2019)","DOI":"10.1109\/ICRA.2019.8794195"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Smith, L.N.: Cyclical learning rates for training neural networks. In: WACV (2017)","DOI":"10.1109\/WACV.2017.58"},{"key":"25_CR31","unstructured":"Tan, M., Le, Q.: EfficientNet: rethinking model scaling for convolutional neural networks. In: ICLR (2019)"},{"key":"25_CR32","unstructured":"Tian, X., Jiang, T., Yun, L., Wang, Y., Wang, Y., Zhao, H.: Occ3D: A Large-Scale 3D Occupancy Prediction Benchmark for Autonomous Driving (2023). arXiv preprint arXiv:2304.14365"},{"key":"25_CR33","unstructured":"Liang, T., et al.: BEVFusion: a simple and robust LiDAR-camera fusion framework. In: NeurIPS (2022)"},{"key":"25_CR34","doi-asserted-by":"crossref","unstructured":"Vora, S., Lang, A.H., Helou, B., Beijbom, O.: PointPainting: sequential fusion for 3D object detection. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Vora, S., Lang, A.H., Helou, B., Beijbom, O.: PointPainting: sequential fusion for 3D object detection. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"Wang, C., Ma, C., Zhu, M., Yang, X.: PointAugmenting: cross-modal augmentation for 3D object detection. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"25_CR37","doi-asserted-by":"crossref","unstructured":"Wang, C., Ma, C., Zhu, M., Yang, X.: PointAugmenting: cross-modal augmentation for 3D object detection. 
In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"25_CR38","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: OpenOccupancy: a large scale benchmark for surrounding semantic occupancy perception. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01636"},{"key":"25_CR39","unstructured":"Wang, Y., Guizilini, V., Zhang, T., Wang, Y., Zhao, H., Solomon, J.M.: DETR3D: 3D object detection from multi-view images via 3D-to-2D queries. In: CoRL (2021)"},{"key":"25_CR40","doi-asserted-by":"crossref","unstructured":"Wu, H., Wen, C., Shi, S., Li, X., Wang, C.: Virtual sparse convolution for multimodal 3D object detection. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02074"},{"key":"25_CR41","doi-asserted-by":"crossref","unstructured":"Wu, X., et al.: Sparse fuse dense: towards high quality 3D detection with depth completion. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00534"},{"key":"25_CR42","doi-asserted-by":"crossref","unstructured":"Xie, Y., et al.: SparseFusion: fusing multi-modal sparse representations for multi-sensor 3D object detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01613"},{"key":"25_CR43","doi-asserted-by":"crossref","unstructured":"Xu, S., Zhou, D., Fang, J., Yin, J., Bin, Z., Zhang, L.: FusionPainting: multimodal fusion with adaptive attention for 3D object detection. In: ITSC (2021)","DOI":"10.1109\/ITSC48978.2021.9564951"},{"key":"25_CR44","doi-asserted-by":"crossref","unstructured":"Yan, J., et al.: Cross modal transformer via coordinates encoding for 3D object detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01675"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Yan, Y., Mao, Y., Li, B.: SECOND: sparsely embedded convolutional detection. Sensors 18(10), 3337 (2018)","DOI":"10.3390\/s18103337"},{"key":"25_CR46","doi-asserted-by":"crossref","unstructured":"Yang, W., et al.: Projecting your view attentively: monocular road scene layout estimation via cross-view transformation. 
In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01528"},{"key":"25_CR47","unstructured":"Yang, Z., Chen, J., Miao, Z., Li, W., Zhu, X., Zhang, L.: DeepInteraction: 3D object detection via modality interaction. In: NeurIPS (2022)"},{"key":"25_CR48","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P.: Center-based 3D object detection and tracking. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"25_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, B., et al.: Semantic Understanding of Scenes through the ADE20K Dataset. IJCV (2019)","DOI":"10.1007\/s11263-018-1140-0"},{"key":"25_CR50","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: VoxelNet: end-to-end learning for point cloud based 3D object detection. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00472"},{"key":"25_CR51","unstructured":"Zhu, B., Jiang, Z., Zhou, X., Li, Z., Yu, G.: Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection (2019). arXiv preprint arXiv:1908.09492"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:25:23Z","timestamp":1727594723000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 
2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}