{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:54:54Z","timestamp":1767142494298,"version":"build-2238731810"},"reference-count":98,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T00:00:00Z","timestamp":1736812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T00:00:00Z","timestamp":1736812800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"Project of the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62403400"],"award-info":[{"award-number":["62403400"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s11263-024-02340-z","type":"journal-article","created":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T01:37:20Z","timestamp":1736818640000},"page":"3481-3518","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Generalized Robot Vision-Language Model via Linguistic Foreground-Aware 
Contrast"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8387-3565","authenticated-orcid":false,"given":"Kangcheng","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chaoqun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaodong","family":"Han","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong-Jin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Baoquan","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,14]]},"reference":[{"key":"2340_CR1","doi-asserted-by":"crossref","unstructured":"Aubry, M., Schlickewei, U., & Cremers, D. (2011). The wave kernel signature: A quantum mechanical approach to shape analysis. In 2011 IEEE international conference on computer vision workshops (ICCV workshops) (pp. 1626\u20131633). IEEE.","DOI":"10.1109\/ICCVW.2011.6130444"},{"key":"2340_CR2","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023). Qwen-VL: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966"},{"key":"2340_CR3","doi-asserted-by":"crossref","unstructured":"Bai, Y., Chen, X., Kirillov, A., Yuille, A., & Berg, A. C. (2022). Point-level region contrast for object detection pre-training. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 16,061\u201316,070).","DOI":"10.1109\/CVPR52688.2022.01559"},{"key":"2340_CR4","doi-asserted-by":"crossref","unstructured":"Behley, J., Garbade, M., Milioto, A., Quenzel, J., Behnke, S., Stachniss, C., & Gall, J. (2019). 
SemanticKITTI: A dataset for semantic scene understanding of lidar sequences. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 9297\u20139307).","DOI":"10.1109\/ICCV.2019.00939"},{"key":"2340_CR5","doi-asserted-by":"crossref","unstructured":"Bronstein, M. M., & Kokkinos, I. (2010). Scale-invariant heat kernel signatures for non-rigid shape recognition. In 2010 IEEE computer society conference on computer vision and pattern recognition (pp. 1704\u20131711). IEEE.","DOI":"10.1109\/CVPR.2010.5539838"},{"key":"2340_CR6","doi-asserted-by":"crossref","unstructured":"Caesar, H., Bankiti, V., Lang, A. H., Vora, S., Liong, V. E., Xu, Q., Krishnan, A., Pan, Y., Baldan, G., & Beijbom, O. (2020). nuScenes: A multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 11,621\u201311,631).","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"2340_CR7","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In European conference on computer vision (pp. 213\u2013229). Springer","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2340_CR8","doi-asserted-by":"crossref","unstructured":"Chen, D. Y., Tian, X. P., Shen, Y. T., & Ouhyoung, M. (2003). On visual similarity based 3D model retrieval. In Computer graphics forum (vol.\u00a022, pp. 223\u2013232). Wiley Online Library.","DOI":"10.1111\/1467-8659.00669"},{"key":"2340_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., Nie\u00dfner, M., & Dai, A. (2022). 4DContrast: Contrastive learning with dynamic correspondences for 3D scene understanding. In European Conference on Computer Vision. Springer.","DOI":"10.1007\/978-3-031-19824-3_32"},{"key":"2340_CR10","doi-asserted-by":"crossref","unstructured":"Cheraghian, A., Rahman, S., Campbell, D., & Petersson, L. (2020). 
Transductive zero-shot learning for 3D point cloud classification. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision (pp. 923\u2013933).","DOI":"10.1109\/WACV45572.2020.9093545"},{"key":"2340_CR11","doi-asserted-by":"crossref","unstructured":"Chibane, J., Engelmann, F., Anh\u00a0Tran, T., & Pons-Moll, G. (2022). Box2Mask: Weakly supervised 3d semantic instance segmentation using bounding boxes. In: European conference on computer vision (pp. 681\u2013699). Springer.","DOI":"10.1007\/978-3-031-19821-2_39"},{"key":"2340_CR12","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A. X., Savva, M., Halber, M., Funkhouser, T., & Nie\u00dfner, M. (2017). ScanNet: Richly-annotated 3D reconstructions of indoor scenes. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5828\u20135839).","DOI":"10.1109\/CVPR.2017.261"},{"key":"2340_CR13","doi-asserted-by":"crossref","unstructured":"Deng, C., Litany, O., Duan, Y., Poulenard, A., Tagliasacchi, A., & Guibas, L. J. (2021). Vector neurons: A general framework for SO (3)-equivariant networks. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 12,200\u201312,209).","DOI":"10.1109\/ICCV48922.2021.01198"},{"key":"2340_CR14","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., & Qi, X. (2023). PLA: Language-driven open-vocabulary 3D scene understanding. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 7010\u20137019).","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"2340_CR15","doi-asserted-by":"crossref","unstructured":"Er\u00e7elik, E., Yurtsever, E., Liu, M., Yang, Z., Zhang, H., Top\u00e7am, P., Listl, M., \u00c7ayl\u0131, Y. K., & Knoll, A. (2022). 3D object detection with a self-supervised lidar scene flow backbone. 
In European Conference on Computer Vision","DOI":"10.1007\/978-3-031-20080-9_15"},{"key":"2340_CR16","doi-asserted-by":"crossref","unstructured":"Esteves, C., Allen-Blanchette, C., Makadia, A., & Daniilidis, K. (2018). Learning SO (3) equivariant representations with spherical CNNs. In Proceedings of the European conference on computer vision (ECCV) (pp. 52\u201368).","DOI":"10.1007\/978-3-030-01261-8_4"},{"issue":"11","key":"2340_CR17","doi-asserted-by":"publisher","first-page":"2270","DOI":"10.1109\/TPAMI.2014.2316828","volume":"36","author":"Y Guo","year":"2014","unstructured":"Guo, Y., Bennamoun, M., Sohel, F., Lu, M., & Wan, J. (2014). 3D object recognition in cluttered scenes with local surface features: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence, 36(11), 2270\u20132287.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2340_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R. (2022). Masked autoencoders are scalable vision learners. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 16,000\u201316,009).","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2340_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask R-CNN. In Proceedings of the IEEE international conference on computer vision (pp. 2961\u20132969).","DOI":"10.1109\/ICCV.2017.322"},{"key":"2340_CR20","doi-asserted-by":"crossref","unstructured":"Hou, J., Graham, B., Nie\u00dfner, M., Xie, S. (2021). Exploring data-efficient 3d scene understanding with contrastive scene contexts. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 15,587\u201315,597).","DOI":"10.1109\/CVPR46437.2021.01533"},{"key":"2340_CR21","doi-asserted-by":"crossref","unstructured":"Huang, S., Gojcic, Z., Usvyatsov, M., Wieser, A., & Schindler, K. 
(2021) PREDATOR: Registration of 3D point clouds with low overlap. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4267\u20134276).","DOI":"10.1109\/CVPR46437.2021.00425"},{"key":"2340_CR22","doi-asserted-by":"crossref","unstructured":"Huang, S., Qi, S., Zhu, Y., Xiao, Y., Xu, Y., Zhu, S. C. (2018). Holistic 3D scene parsing and reconstruction from a single RGB image. In Proceedings of the European conference on computer vision (pp. 187\u2013203).","DOI":"10.1007\/978-3-030-01234-2_12"},{"key":"2340_CR23","doi-asserted-by":"crossref","unstructured":"Huang, S., Xie, Y., Zhu, S. C., & Zhu, Y. (2021). Spatio-temporal self-supervised representation learning for 3D point clouds. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 6535\u20136545).","DOI":"10.1109\/ICCV48922.2021.00647"},{"key":"2340_CR24","doi-asserted-by":"crossref","unstructured":"Jiang, L., Zhao, H., Shi, S., Liu, S., Fu, C. W., & Jia, J. (2020). PointGroup: Dual-set point grouping for 3D instance segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4867\u20134876).","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"2340_CR25","unstructured":"Jiang, M., Wu, Y., Zhao, T., Zhao, Z., & Lu, C. (2018). PointSIFT: A sift-like network module for 3d point cloud semantic segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition."},{"key":"2340_CR26","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"issue":"1","key":"2340_CR27","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1002\/nav.20053","volume":"52","author":"HW Kuhn","year":"2005","unstructured":"Kuhn, H. W. (2005). The method for the assignment problem. 
Naval Research Logistics (NRL), 52(1), 7\u201321.","journal-title":"Naval Research Logistics (NRL)"},{"key":"2340_CR28","doi-asserted-by":"crossref","unstructured":"Landrieu, L., & Boussaha, M. (2019). Point cloud oversegmentation with graph-structured deep metric learning. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 7440\u20137449).","DOI":"10.1109\/CVPR.2019.00762"},{"key":"2340_CR29","doi-asserted-by":"crossref","unstructured":"Landrieu, L., & Simonovsky, M. (2018). Large-scale point cloud semantic segmentation with superpoint graphs. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4558\u20134567)","DOI":"10.1109\/CVPR.2018.00479"},{"key":"2340_CR30","doi-asserted-by":"crossref","unstructured":"Lang, A. H., Vora, S., Caesar, H., Zhou, L., Yang, J., & Beijbom, O. (2019). PointPillars: Fast encoders for object detection from point clouds. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 12,697\u201312,705)","DOI":"10.1109\/CVPR.2019.01298"},{"key":"2340_CR31","doi-asserted-by":"crossref","unstructured":"Li, L., & Heizmann, M. (2022). A closer look at invariances in self-supervised pre-training for 3d vision. In European Conference on Computer Vision. Springer.","DOI":"10.1007\/978-3-031-20056-4_38"},{"key":"2340_CR32","doi-asserted-by":"crossref","unstructured":"Liang, H., Jiang, C., Feng, D., Chen, X., Xu, H., Liang, X., Zhang, W., Li, Z., & Van\u00a0Gool, L. (2021). Exploring geometry-aware contrast and clustering harmonization for self-supervised 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 3293\u20133302).","DOI":"10.1109\/ICCV48922.2021.00328"},{"key":"2340_CR33","doi-asserted-by":"crossref","unstructured":"Liang, Y., Zhao, S., Yu, B., Zhang, J., & He, F. (2022). MeshMAE: Masked autoencoders for 3d mesh data analysis. 
In European conference on computer vision","DOI":"10.1007\/978-3-031-20062-5_3"},{"key":"2340_CR34","unstructured":"Liu, K. (2023a). Learning-based defect recognitions for autonomous UAV inspections. arxiv preprint arxiv:2302.06093"},{"key":"2340_CR35","unstructured":"Liu, K. (2022a). An enhanced lidar-inertial slam system for robotics localization and mapping. arXiv preprint arXiv:2212.14209"},{"key":"2340_CR36","unstructured":"Liu, K. (2022b). An integrated lidar-slam system for complex environment with noisy point clouds. arXiv preprint arXiv:2212.05705"},{"key":"2340_CR37","unstructured":"Liu, K. (2022c). An integrated visual system for unmanned aerial vehicles tracking and landing on the ground vehicles. arXiv preprint arXiv:2301.00198"},{"key":"2340_CR38","doi-asserted-by":"crossref","unstructured":"Liu, K. (2022d). A robust and efficient lidar-inertial-visual fused simultaneous localization and mapping system with loop closure. In 2022 12th International conference on cyber technology in automation, control, and intelligent systems (CYBER) (pp. 1182\u20131187). IEEE.","DOI":"10.1109\/CYBER55403.2022.9907651"},{"key":"2340_CR39","doi-asserted-by":"crossref","unstructured":"Liu, K. (2022e). Robust industrial UAV\/UGV-based unsupervised domain adaptive crack recognitions with depth and edge awareness: from system and database constructions to real-site inspections. In Proceedings of the 30th ACM international conference on multimedia (pp. 5361\u20135370)","DOI":"10.1145\/3503161.3548304"},{"key":"2340_CR40","doi-asserted-by":"crossref","unstructured":"Liu, K. (2022f). Semi-supervised confidence-level-based contrastive discrimination for class-imbalanced semantic segmentation. In 2022 12th International conference on CYBER technology in automation, control, and intelligent systems (CYBER) (pp. 1230\u20131235). IEEE.","DOI":"10.1109\/CYBER55403.2022.9907146"},{"key":"2340_CR41","unstructured":"Liu, K. (2023b). 
Learning-based defect recognitions for autonomous UAV inspections. arXiv preprint arXiv:2302.06093"},{"key":"2340_CR42","unstructured":"Liu, K. (2023c). A lidar-inertial-visual slam system with loop detection. arXiv preprint arXiv:2301.05604"},{"key":"2340_CR43","doi-asserted-by":"crossref","unstructured":"Liu, K. (2023d). RM3D: Robust data-efficient 3D scene parsing via traditional and learnt 3D descriptors-based semantic region merging. International Journal of Computer Vision, 131(4), 938\u2013967.","DOI":"10.1007\/s11263-022-01740-3"},{"issue":"5","key":"2340_CR44","doi-asserted-by":"publisher","first-page":"2876","DOI":"10.1109\/TMECH.2023.3253715","volume":"28","author":"K Liu","year":"2023","unstructured":"Liu, K., & Cao, M. (2023). DLC-SLAM: A robust LiDAR-slam system with learning-based denoising and loop closure. IEEE\/ASME Transactions on Mechatronics, 28(5), 2876\u20132884.","journal-title":"IEEE\/ASME Transactions on Mechatronics"},{"key":"2340_CR45","doi-asserted-by":"crossref","unstructured":"Liu, K., & Chen, B. M. (2022a). Industrial UAV-based unsupervised domain adaptive crack recognitions: From database towards real-site infrastructural inspections. IEEE Transactions on Industrial Electronics, 70(9), 9410\u20139420.","DOI":"10.1109\/TIE.2022.3204953"},{"key":"2340_CR46","doi-asserted-by":"crossref","unstructured":"Liu, K., & Chen, B. M. (2022b). Industrial UAV-based unsupervised domain adaptive crack recognitions: From system setups to real-site infrastructural inspections. IEEE Transactions on Industrial Electronics, 70, 9410\u20139420.","DOI":"10.1109\/TIE.2022.3204953"},{"key":"2340_CR47","doi-asserted-by":"crossref","unstructured":"Liu, K., Gao, Z., Lin, F., & Chen, B. M. (2020). FG-Net: Fast large-scale lidar point clouds understanding network leveraging correlated feature mining and geometric-aware modelling. 
arXiv preprint arXiv:2012.09439","DOI":"10.1109\/ICRA48506.2021.9561496"},{"key":"2340_CR48","doi-asserted-by":"crossref","unstructured":"Liu, K., Gao, Z., Lin, F., & Chen, B. M.(2021). FG-Conv: Large-scale LiDAR point clouds understanding leveraging feature correlation mining and geometric-aware modeling. In 2021 IEEE international conference on robotics and automation (ICRA) (pp. 12,896\u201312,902). IEEE.","DOI":"10.1109\/ICRA48506.2021.9561496"},{"issue":"1","key":"2340_CR49","doi-asserted-by":"publisher","first-page":"553","DOI":"10.1109\/TCYB.2022.3159815","volume":"53","author":"K Liu","year":"2022","unstructured":"Liu, K., Gao, Z., Lin, F., & Chen, B. M. (2022). FG-Net: A fast and accurate framework for large-scale lidar point cloud understanding. IEEE Transactions on Cybernetics, 53(1), 553\u2013564.","journal-title":"IEEE Transactions on Cybernetics"},{"key":"2340_CR50","doi-asserted-by":"crossref","unstructured":"Liu, K., Han, X., & Chen, B. M. (2019). Deep learning based automatic crack detection and segmentation for unmanned aerial vehicle inspections. In 2019 IEEE international conference on robotics and biomimetics (ROBIO) (pp. 381\u2013387). IEEE.","DOI":"10.1109\/ROBIO49542.2019.8961534"},{"key":"2340_CR51","doi-asserted-by":"crossref","unstructured":"Liu, K., & Ou, H. (2022a). A light-weight lidar-inertial slam system with high efficiency and loop closure detection capacity. In 2022 International conference on advanced robotics and mechatronics (ICARM) (pp. 284\u2013289). IEEE.","DOI":"10.1109\/ICARM54641.2022.9959555"},{"key":"2340_CR52","doi-asserted-by":"crossref","unstructured":"Liu, K., & Ou, H. (2022b). A light-weight lidar-inertial slam system with loop closing. arXiv preprint arXiv:2212.05743","DOI":"10.1109\/ICARM54641.2022.9959555"},{"issue":"3","key":"2340_CR53","doi-asserted-by":"publisher","first-page":"3097","DOI":"10.1109\/TPWRS.2017.2761897","volume":"33","author":"K Liu","year":"2017","unstructured":"Liu, K., Qu, Y., Kim, H. 
M., & Song, H. (2017). Avoiding frequency second dip in power unreserved control during wind power rotational speed recovery. IEEE Transactions on Power Systems, 33(3), 3097\u20133106.","journal-title":"IEEE Transactions on Power Systems"},{"key":"2340_CR54","doi-asserted-by":"crossref","unstructured":"Liu, K., Xiao, A., Huang, J., Cui, K., Xing, Y., & Lu, S. (2022a). D-LC-Nets: Robust denoising and loop closing networks for lidar slam in complicated circumstances with noisy point clouds. In IEEE\/RSJ international conference on intelligent robots and systems (IROS) (pp. 3097\u20133106).","DOI":"10.1109\/IROS47612.2022.9981388"},{"key":"2340_CR55","doi-asserted-by":"crossref","unstructured":"Liu, K., Xiao, A., Huang, J., Cui, K., Xing, Y., & Lu, S. (2022b). D-LC-Nets: Robust denoising and loop closing networks for lidar slam in complicated circumstances with noisy point clouds. In 2022 IEEE\/RSJ international conference on intelligent robots and systems (IROS) (pp. 12,212\u201312,218). IEEE.","DOI":"10.1109\/IROS47612.2022.9981388"},{"key":"2340_CR56","doi-asserted-by":"crossref","unstructured":"Liu, K., Xiao, A., Zhang, X., Lu, S., & Shao, L. (2023). FAC: 3D representation learning via foreground aware feature contrast. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9476\u20139485).","DOI":"10.1109\/CVPR52729.2023.00914"},{"key":"2340_CR57","doi-asserted-by":"crossref","unstructured":"Liu, K., Yang, G., Zhang, J., Zhao, Z., Chen, X., & Chen, B. M. (2022c). Datasets and methods for boosting infrastructure inspection: A survey on defect segmentation and detection. In 2022 IEEE 17th international conference on control & automation (ICCA) (pp. 23\u201330). IEEE.","DOI":"10.1109\/ICCA54724.2022.9831925"},{"key":"2340_CR58","doi-asserted-by":"crossref","unstructured":"Liu, K., Zhao, Y., Gao, Z., & Chen, B. M. (2022d). 
WeakLabel3D-Net: A complete framework for real-scene lidar point clouds weakly supervised multi-tasks understanding. In 2022 international conference on robotics and automation (ICRA) (pp. 5108\u20135115). IEEE.","DOI":"10.1109\/ICRA46639.2022.9811959"},{"key":"2340_CR59","doi-asserted-by":"crossref","unstructured":"Liu, K., Zhao, Y., Nie, Q., Gao, Z., & Chen, B. M. (2022e). Weakly supervised 3d scene segmentation with region-level boundary awareness and instance discrimination. In European Conference on Computer Vision 2022 (ECCV 2022) (pp. 37\u201355). Springer, Cham.","DOI":"10.1007\/978-3-031-19815-1_3"},{"key":"2340_CR60","unstructured":"Liu, K., Zheng, X., Wang, C., Wang, H., Liu, M., & Tang, K. (2024). Online robot navigation and manipulation with distilled vision-language models. arXiv preprint arXiv:2401.17083"},{"key":"2340_CR61","doi-asserted-by":"crossref","unstructured":"Liu, K., Zhou, X., & Chen, B. M. (2022f). An enhanced lidar inertial localization and mapping system for unmanned ground vehicles. In 2022 IEEE 17th international conference on control & automation (ICCA) (pp. 587\u2013592). IEEE.","DOI":"10.1109\/ICCA54724.2022.9831822"},{"key":"2340_CR62","doi-asserted-by":"crossref","unstructured":"Liu, K., Zhou, X., Zhao, B., Ou, H., & Chen, B. M. (2022g). An integrated visual system for unmanned aerial vehicles following ground vehicles: Simulations and experiments. In 2022 IEEE 17th international conference on control & automation (ICCA) (pp. 593\u2013598). IEEE.","DOI":"10.1109\/ICCA54724.2022.9831831"},{"key":"2340_CR63","doi-asserted-by":"crossref","unstructured":"Liu, M., Zhou, Y., Qi, C. R., Gong, B., Su, H., & Anguelov, D. (2022h). Less: Label-efficient semantic segmentation for lidar point clouds. In European conference on computer vision (pp. 70\u201389). 
Springer.","DOI":"10.1007\/978-3-031-19842-7_5"},{"key":"2340_CR64","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021a). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 10,012\u201310,022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2340_CR65","doi-asserted-by":"crossref","unstructured":"Liu, Z., Qi, X., & Fu, C. W. (2021b). One thing one click: A self-training approach for weakly supervised 3D semantic segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 1726\u20131736).","DOI":"10.1109\/CVPR46437.2021.00177"},{"key":"2340_CR66","doi-asserted-by":"crossref","unstructured":"Mao, J., Xue, Y., Niu, M., Bai, H., Feng, J., Liang, X., Xu, H., & Xu, C. (2021). Voxel transformer for 3D object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 3164\u20133173).","DOI":"10.1109\/ICCV48922.2021.00315"},{"key":"2340_CR67","doi-asserted-by":"crossref","unstructured":"Michele, B., Boulch, A., Puy, G., Bucher, M., & Marlet, R. (2021). Generative zero-shot learning for semantic segmentation of 3D point clouds. In 2021 International Conference on 3D vision (3DV) (pp. 992\u20131002). IEEE.","DOI":"10.1109\/3DV53792.2021.00107"},{"key":"2340_CR68","unstructured":"Oord, A.v.d., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748"},{"key":"2340_CR69","doi-asserted-by":"crossref","unstructured":"Pang, Y., Wang, W., Tay, F. E., Liu, W., Tian, Y., & Yuan, L. (2022). Masked autoencoders for point cloud self-supervised learning. In European conference on computer vision. Springer.","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"2340_CR70","doi-asserted-by":"crossref","unstructured":"Papon, J., Abramov, A., Schoeler, M., & Worgotter, F. (2013). 
Voxel cloud connectivity segmentation-supervoxels for point clouds. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2027\u20132034).","DOI":"10.1109\/CVPR.2013.264"},{"key":"2340_CR71","unstructured":"Passaro, S., & Zitnick, C. L. (2023). Reducing SO (3) convolutions to SO (2) for efficient equivariant GNNs. In International conference on machine learning (pp. 27,420\u201327,438). PMLR."},{"key":"2340_CR72","doi-asserted-by":"crossref","unstructured":"Rao, Y., Liu, B., Wei, Y., Lu, J., Hsieh, C. J., & Zhou, J. (2021). RandomRooms: Unsupervised pre-training from synthetic shapes and randomized layouts for 3D object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 3283\u20133292).","DOI":"10.1109\/ICCV48922.2021.00327"},{"key":"2340_CR73","doi-asserted-by":"crossref","unstructured":"Rozenberszki, D., Litany, O., & Dai, A. (2022). Language-grounded indoor 3D semantic segmentation in the wild. In European conference on computer vision (pp. 125\u2013141). Springer.","DOI":"10.1007\/978-3-031-19827-4_8"},{"key":"2340_CR74","doi-asserted-by":"crossref","unstructured":"Rusu, R. B., Blodow, N., & Beetz, M. (2009). Fast point feature histograms (FPFH) for 3D registration. In 2009 IEEE international conference on robotics and automation (ICRA) (pp. 3212\u20133217). IEEE.","DOI":"10.1109\/ROBOT.2009.5152473"},{"key":"2340_CR75","doi-asserted-by":"crossref","unstructured":"Rusu, R. B., Blodow, N., Marton, Z. C., & Beetz, M. (2008). Aligning point cloud views using persistent feature histograms. In 2008 IEEE\/RSJ international conference on intelligent robots and systems (IROS) (pp. 3384\u20133391). IEEE.","DOI":"10.1109\/IROS.2008.4650967"},{"key":"2340_CR76","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1016\/j.cviu.2014.04.011","volume":"125","author":"S Salti","year":"2014","unstructured":"Salti, S., Tombari, F., & Di Stefano, L. (2014). 
Shot: Unique signatures of histograms for surface and texture description. Computer Vision and Image Understanding, 125, 251\u2013264.","journal-title":"Computer Vision and Image Understanding"},{"key":"2340_CR77","doi-asserted-by":"crossref","unstructured":"Sanghi, A. (2020). Info3D: Representation learning on 3D objects using mutual information maximization and contrastive learning. In European conference on computer vision (pp. 626\u2013642). Springer","DOI":"10.1007\/978-3-030-58526-6_37"},{"key":"2340_CR78","doi-asserted-by":"crossref","unstructured":"Shi, S., Wang, X., & Li, H.(2019). PointRCNN: 3D object proposal generation and detection from point cloud. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 770\u2013779).","DOI":"10.1109\/CVPR.2019.00086"},{"issue":"8","key":"2340_CR79","first-page":"2647","volume":"43","author":"S Shi","year":"2020","unstructured":"Shi, S., Wang, Z., Shi, J., Wang, X., & Li, H. (2020). From points to parts: 3D object detection from point cloud with part-aware and part-aggregation network. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43(8), 2647\u20132664.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2340_CR80","doi-asserted-by":"crossref","unstructured":"Sun, J., Ovsjanikov, M., & Guibas, L. (2009). A concise and provably informative multi-scale signature based on heat diffusion. In Computer graphics forum (vol.\u00a028, pp. 1383\u20131392). Wiley Online Library.","DOI":"10.1111\/j.1467-8659.2009.01515.x"},{"key":"2340_CR81","doi-asserted-by":"crossref","unstructured":"Sun, P., Kretzschmar, H., Dotiwalla, X., Chouard, A., Patnaik, V., Tsui, P., Guo, J., Zhou, Y., Chai, Y., Caine, B., & Vasudevan V. (2020). Scalability in perception for autonomous driving: Waymo open dataset. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 
2446\u20132454).","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"2340_CR82","unstructured":"Takmaz, A., Fedele, E., Sumner, R. W., Pollefeys, M., Tombari, F., & Engelmann, F. (2023). OpenMask3D: Open-vocabulary 3D instance segmentation. arXiv preprint arXiv:2306.13631"},{"key":"2340_CR83","unstructured":"Team, O. (2020). OpenPCDet: An open-source toolbox for 3d object detection from point clouds. OD Team"},{"key":"2340_CR84","doi-asserted-by":"crossref","unstructured":"Uy, M. A., Pham, Q. H., Hua, B. S., Nguyen, T., & Yeung, S. K. (2019). Revisiting point cloud classification: A new benchmark dataset and classification model on real-world data. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 1588\u20131597).","DOI":"10.1109\/ICCV.2019.00167"},{"issue":"11","key":"2340_CR85","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. Journal of Machine Learning Research, 9(11), 2579\u20132605.","journal-title":"Journal of Machine Learning Research"},{"key":"2340_CR86","doi-asserted-by":"crossref","unstructured":"Vu, T., Kim, K., Luu, T. M., Nguyen, T., & Yoo, C. D. (2022). SoftGroup for 3D instance segmentation on point clouds. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2708\u20132717).","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"2340_CR87","doi-asserted-by":"crossref","unstructured":"Wang, H., Cong, Y., Litany, O., Gao, Y., & Guibas, L. J. (2021a). 3DIoUMatch: Leveraging IoU prediction for semi-supervised 3D object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 14,615\u201314,624).","DOI":"10.1109\/CVPR46437.2021.01438"},{"key":"2340_CR88","doi-asserted-by":"crossref","unstructured":"Wang, H., Liu, Q., Yue, X., Lasenby, J., & Kusner, M. J. (2021b). Unsupervised point cloud pre-training via occlusion completion. 
In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 9782\u20139792).","DOI":"10.1109\/ICCV48922.2021.00964"},{"key":"2340_CR89","doi-asserted-by":"crossref","unstructured":"Xie, S., Gu, J., Guo, D., Qi, C. R., Guibas, L., & Litany, O. (2020a). PointContrast: Unsupervised pre-training for 3D point cloud understanding. In European Conference on Computer Vision (pp. 574\u2013591). Springer.","DOI":"10.1007\/978-3-030-58580-8_34"},{"key":"2340_CR90","unstructured":"Xie, Y., Dai, H., Chen, M., Dai, B., Zhao, T., Zha, H., Wei, W., & Pfister, T. (2020b). Differentiable top-k with optimal transport. In Advances in neural information processing systems (vol. 33, pp. 20520\u201320531)."},{"key":"2340_CR91","doi-asserted-by":"crossref","unstructured":"Xie, Z., Zhang, Z., Cao, Y., Lin, Y., Bao, J., Yao, Z., Dai, Q., & Hu, H. (2022). SimMIM: A simple framework for masked image modeling. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9653\u20139663).","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"2340_CR92","doi-asserted-by":"crossref","unstructured":"Xu, C., Tan, R.T., Tan, Y., Chen, S., Wang, Y.G., Wang, X., & Wang, Y. (2023). EqMotion: Equivariant multi-agent motion prediction with invariant interaction reasoning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 1410\u20131420).","DOI":"10.1109\/CVPR52729.2023.00142"},{"key":"2340_CR93","doi-asserted-by":"crossref","unstructured":"Yang, G., Liu, K., Zhao, Z., Zhang, J., Chen, X., & Chen, B. M. (2022). Datasets and methods for boosting infrastructure inspection: A survey on defect classification. In 2022 IEEE 17th international conference on control & automation (ICCA) (pp. 15\u201322). IEEE.","DOI":"10.1109\/ICCA54724.2022.9831922"},{"key":"2340_CR94","doi-asserted-by":"crossref","unstructured":"Yang, J., Deng, C., Wu, J., Antonova, R., Guibas, L., & Bohg, J. (2024). 
EquivAct: SIM(3)-equivariant visuomotor policies beyond rigid object manipulation. In 2024 IEEE international conference on robotics and automation (ICRA) (pp. 9249\u20139255). IEEE.","DOI":"10.1109\/ICRA57147.2024.10611491"},{"key":"2340_CR95","doi-asserted-by":"crossref","unstructured":"Yin, J., Zhou, D., Zhang, L., Fang, J., Xu, C. Z., Shen, J., & Wang, W. (2022). ProposalContrast: Unsupervised pre-training for LiDAR-based 3D object detection. In European conference on computer vision (pp. 574\u2013591). Springer.","DOI":"10.1007\/978-3-031-19842-7_2"},{"key":"2340_CR96","unstructured":"Zhang, Z., Bai, M., & Li, E. (2022). Self-supervised pretraining for large-scale point clouds. In Advances in neural information processing systems."},{"key":"2340_CR97","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Sun, B., Yang, H., & Huang, Q. (2020). H3dnet: 3D object detection using hybrid geometric primitives. In European conference on computer vision (pp. 311\u2013329). Springer.","DOI":"10.1007\/978-3-030-58610-2_19"},{"key":"2340_CR98","doi-asserted-by":"crossref","unstructured":"Zhou, Y., & Tuzel, O. (2018). VoxelNet: End-to-end learning for point cloud based 3D object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 
4490\u20134499).","DOI":"10.1109\/CVPR.2018.00472"}],"updated-by":[{"DOI":"10.1007\/s11263-025-02383-w","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2025,3,15]],"date-time":"2025-03-15T00:00:00Z","timestamp":1741996800000}}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02340-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02340-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02340-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,10]],"date-time":"2025-05-10T02:55:34Z","timestamp":1746845734000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02340-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,14]]},"references-count":98,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["2340"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02340-z","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,14]]},"assertion":[{"value":"30 November 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"value":"13 February 2025","order":4,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":5,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The multiple errors in the article has been corrected.","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 March 2025","order":7,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":8,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A Correction to this paper has been published:","order":9,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"https:\/\/doi.org\/10.1007\/s11263-025-02383-w","URL":"https:\/\/doi.org\/10.1007\/s11263-025-02383-w","order":10,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}]}}