{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:52:27Z","timestamp":1742914347905,"version":"3.40.3"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781094"},{"type":"electronic","value":"9783031781100"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78110-0_29","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:54:26Z","timestamp":1733090066000},"page":"445-460","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Transformer-Based RGB and\u00a0LiDAR Fusion for\u00a0Enhanced Object Detection"],"prefix":"10.1007","author":[{"given":"Reza","family":"Sadeghian","sequence":"first","affiliation":[]},{"given":"Niloofar","family":"Hooshyaripour","sequence":"additional","affiliation":[]},{"given":"WonSook","family":"Lee","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Bai, X., et al.: Transfusion: robust lidar-camera fusion for 3d object detection with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision And Pattern Recognition, pp. 
1090\u20131099 (2022)","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Brazil, G., Kumar, A., Straub, J., Ravi, N., Johnson, J., Gkioxari, G.: Omni3d: a large benchmark and model for 3d object detection in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13154\u201313164 (2023)","DOI":"10.1109\/CVPR52729.2023.01264"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Chai, Y., et al.: To the point: efficient 3d object detection in the range image with graph convolution kernels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2021)","DOI":"10.1109\/CVPR46437.2021.01574"},{"key":"29_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Y., Li, Y., Zhang, X., Sun, J., Jia, J.: Focal sparse convolutional networks for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5428\u20135437 (2022)","DOI":"10.1109\/CVPR52688.2022.00535"},{"key":"29_CR5","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"29_CR6","doi-asserted-by":"crossref","unstructured":"El\u00a0Ahmar, W., et al.: Enhanced thermal-rgb fusion for robust object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 365\u2013374 (2023)","DOI":"10.1109\/CVPRW59228.2023.00042"},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Fan, L., Xiong, X., Wang, F., Wang, N., Zhang, Z.: Rangedet: in defense of range view for lidar-based 3d object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
2918\u20132927 (2021)","DOI":"10.1109\/ICCV48922.2021.00291"},{"key":"29_CR8","unstructured":"Fu, C.Y., Liu, W., Ranga, A., Tyagi, A., Berg, A.C.: Dssd: Deconvolutional single shot detector. arXiv preprint arXiv:1701.06659 (2017)"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? the kitti vision benchmark suite. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3354\u20133361. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"29_CR10","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Guan, T., et al.: M3detr: multi-representation, multi-scale, mutual-relation 3d object detection with transformers. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 772\u2013782 (2022)","DOI":"10.1109\/WACV51458.2022.00235"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"He, Q., Wang, Z., Zeng, H., Zeng, Y., Liu, Y.: Svga-net: sparse voxel-graph attention network for 3d object detection from point clouds. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a036, pp. 870\u2013878 (2022)","DOI":"10.1609\/aaai.v36i1.19969"},{"key":"29_CR13","doi-asserted-by":"publisher","unstructured":"Huang, T., Liu, Z., Chen, X., Bai, X.: EPNet: Enhancing Point Features with Image Semantics for 3D Object Detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 35\u201352. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_3","DOI":"10.1007\/978-3-030-58555-6_3"},{"key":"29_CR14","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. 
arXiv preprint arXiv:1412.6980 (2014)"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Lang, A.H., Vora, S., Caesar, H., Zhou, L., Yang, J., Beijbom, O.: Pointpillars: fast encoders for object detection from point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12697\u201312705 (2019)","DOI":"10.1109\/CVPR.2019.01298"},{"key":"29_CR16","unstructured":"Li, C., et\u00a0al.: Yolov6: A single-stage object detection framework for industrial applications. arXiv preprint arXiv:2209.02976 (2022)"},{"key":"29_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., et\u00a0al.: Deepfusion: Lidar-camera deep fusion for multi-modal 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17182\u201317191 (2022)","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"29_CR18","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Bevfusion: multi-task multi-sensor fusion with unified bird\u2019s-eye view representation. In: 2023 IEEE International Conference on Robotics and Automation (ICRA), pp. 2774\u20132781. IEEE (2023)","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"29_CR19","doi-asserted-by":"crossref","unstructured":"Min, Z., Zhuang, B., Schulter, S., Liu, B., Dunn, E., Chandraker, M.: Neurocs: neural nocs supervision for monocular 3d object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21404\u201321414 (2023)","DOI":"10.1109\/CVPR52729.2023.02050"},{"key":"29_CR20","doi-asserted-by":"crossref","unstructured":"Pasandi, M.M., Liu, T., Massoud, Y., Lagani\u00e8re, R.: Sensor fusion operators for multimodal 2d object detection. In: International Symposium on Visual Computing, pp. 184\u2013195. 
Springer (2022)","DOI":"10.1007\/978-3-031-20713-6_14"},{"key":"29_CR21","unstructured":"Qi, D., Su, L., Song, J., Cui, E., Bharti, T., Sacheti, A.: Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data. arXiv preprint arXiv:2001.07966 (2020)"},{"key":"29_CR22","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Advances in Neural Information Processing Systems 28 (2015)"},{"key":"29_CR23","doi-asserted-by":"crossref","unstructured":"Shi, S., Wang, X., Li, H.: Pointrcnn: 3d object proposal generation and detection from point cloud. In: Proceedings of the IEEE\/CVF Conference on Computer Vision And Pattern Recognition, pp. 770\u2013779 (2019)","DOI":"10.1109\/CVPR.2019.00086"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Sun, P., et\u00a0al.: Scalability in perception for autonomous driving: Waymo open dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2446\u20132454 (2020)","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Sun, P., et al.: Rsn: range sparse net for efficient, accurate lidar 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5725\u20135734 (2021)","DOI":"10.1109\/CVPR46437.2021.00567"},{"key":"29_CR26","unstructured":"Trockman, A., Kolter, J.Z.: Patches are all you need? arXiv preprint arXiv:2201.09792 (2022)"},{"key":"29_CR27","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"29_CR28","doi-asserted-by":"crossref","unstructured":"Wang, C.Y., Bochkovskiy, A., Liao, H.Y.M.: Yolov7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
7464\u20137475 (2023)","DOI":"10.1109\/CVPR52729.2023.00721"},{"key":"29_CR29","doi-asserted-by":"crossref","unstructured":"Wang, C., Ma, C., Zhu, M., Yang, X.: Pointaugmenting: Cross-modal augmentation for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11794\u201311803 (2021)","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"29_CR30","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Mononerd: Nerf-like representations for monocular 3d object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6814\u20136824 (2023)","DOI":"10.1109\/ICCV51070.2023.00627"},{"key":"29_CR31","doi-asserted-by":"crossref","unstructured":"Xu, Q., Zhong, Y., Neumann, U.: Behind the curtain: Learning occluded shapes for 3d object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 2893\u20132901 (2022)","DOI":"10.1609\/aaai.v36i3.20194"},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Yang, H., et al.: Pvt-ssd: Single-stage 3d object detector with point-voxel transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13476\u201313487 (2023)","DOI":"10.1109\/CVPR52729.2023.01295"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Yang, Z., Sun, Y., Liu, S., Shen, X., Jia, J.: Std: Sparse-to-dense 3d object detector for point cloud. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1951\u20131960 (2019)","DOI":"10.1109\/ICCV.2019.00204"},{"key":"29_CR34","doi-asserted-by":"publisher","unstructured":"Yoo, J.H., Kim, Y., Kim, J., Choi, J.W.: 3D-CVF: Generating Joint Camera and LiDAR Features Using Cross-view Spatial Feature Fusion for 3D Object Detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12372, pp. 720\u2013736. Springer, Cham (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-58583-9_43","DOI":"10.1007\/978-3-030-58583-9_43"},{"key":"29_CR35","doi-asserted-by":"publisher","first-page":"3051","DOI":"10.1007\/s11263-021-01515-2","volume":"129","author":"C Yu","year":"2021","unstructured":"Yu, C., Gao, C., Wang, J., Yu, G., Shen, C., Sang, N.: Bisenet v2: bilateral network with guided aggregation for real-time semantic segmentation. Int. J. Comput. Vision 129, 3051\u20133068 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"29_CR36","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE International Conference On Computer Vision, pp. 1821\u20131830 (2017)","DOI":"10.1109\/ICCV.2017.202"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Chen, J., Huang, D.: Cat-det: contrastively augmented transformer for multi-modal 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 908\u2013917 (2022)","DOI":"10.1109\/CVPR52688.2022.00098"},{"key":"29_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Hu, Q., Xu, G., Ma, Y., Wan, J., Guo, Y.: Not all points are equal: learning highly efficient point-based detectors for 3d lidar point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18953\u201318962 (2022)","DOI":"10.1109\/CVPR52688.2022.01838"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Zhou, C., Zhang, Y., Chen, J., Huang, D.: Octr: Octree-based transformer for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
5166\u20135175 (2023)","DOI":"10.1109\/CVPR52729.2023.00500"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78110-0_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:35:43Z","timestamp":1733096143000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78110-0_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031781094","9783031781100"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78110-0_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}