{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:36:34Z","timestamp":1775838994551,"version":"3.50.1"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031727535","type":"print"},{"value":"9783031727542","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72754-2_16","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"278-295","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Segment3D: Learning Fine-Grained Class-Agnostic 3D Segmentation Without Manual Labels"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4970-698X","authenticated-orcid":false,"given":"Rui","family":"Huang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6085-8059","authenticated-orcid":false,"given":"Songyou","family":"Peng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7834-4565","authenticated-orcid":false,"given":"Ay\u00e7a","family":"Takmaz","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5598-5212","authenticated-orcid":false,"given":"Federico","family":"Tombari","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2448-2318","authenticated-orcid":false,"given":"Marc","family":"Pollefeys","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7361-9283","authenticated-orcid":false,"given":"Shiji","family":"Song","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7251-0988","authenticated-orcid":false,"given":"Gao","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5745-2137","authenticated-orcid":false,"given":"Francis","family":"Engelmann","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Armeni, I., et al.: 3D semantic parsing of large-scale indoor spaces. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.170"},{"key":"16_CR2","unstructured":"Baruch, G., et\u00a0al.: ARKitScenes: a diverse real-world dataset for 3D indoor scene understanding using mobile RGB-D data. arXiv preprint arXiv:2111.08897 (2021)"},{"key":"16_CR3","unstructured":"Bhat, S.F., Birkl, R., Wofk, D., Wonka, P., M\u00fcller, M.: ZoeDepth: zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"16_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"16_CR7","unstructured":"Chen, M., et al.: STPLS3D: a large-scale synthetic and real aerial photogrammetry 3D point cloud dataset. arXiv preprint arXiv:2203.09065 (2022)"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Chen, R., et al.: CLIP2Scene: towards label-efficient 3D scene understanding by CLIP. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Chen, S., Fang, J., Zhang, Q., Liu, W., Wang, X.: Hierarchical aggregation for 3D instance segmentation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01518"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"16_CR11","unstructured":"Cheng, B., Schwing, A., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. In: NeurIPS (2021)"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Choy, C., Gwak, J., Savarese, S.: 4D spatio-temporal convnets: minkowski convolutional neural networks. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00319"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.261"},{"issue":"4","key":"16_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3054739","volume":"36","author":"A Dai","year":"2017","unstructured":"Dai, A., Nie\u00dfner, M., Zollh\u00f6fer, M., Izadi, S., Theobalt, C.: BundleFusion: real-time globally consistent 3D reconstruction using on-the-fly surface re-integration. TOG 36(4), 1 (2017)","journal-title":"TOG"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Delitzas, A., Takmaz, A., Tombari, F., Sumner, R., Pollefeys, M., Engelmann, F.: SceneFun3D: fine-grained functionality and affordance understanding in 3D scenes. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01377"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: PLA: language-driven open-vocabulary 3D scene understanding. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Engelmann, F., Bokeloh, M., Fathi, A., Leibe, B., Nie\u00dfner, M.: 3D-MPA: multi-proposal aggregation for 3D semantic instance segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00905"},{"key":"16_CR18","unstructured":"Engelmann, F., Manhardt, F., Niemeyer, M., Tateno, K., Tombari, F.: OpenNeRF: open set 3D neural scene segmentation with pixel-wise features and rendered novel views. In: ICLR (2024)"},{"key":"16_CR19","unstructured":"Ester, M., Kriegel, H.P., Sander, J., Xu, X., et\u00a0al.: A density-based algorithm for discovering clusters in large spatial databases with noise. In: KDD (1996)"},{"key":"16_CR20","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1023\/B:VISI.0000022288.19776.77","volume":"59","author":"PF Felzenszwalb","year":"2004","unstructured":"Felzenszwalb, P.F., Huttenlocher, D.P.: Efficient graph-based image segmentation. IJCV 59, 167\u2013181 (2004)","journal-title":"IJCV"},{"key":"16_CR21","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. In: ICLR (2022)"},{"key":"16_CR22","unstructured":"Ha, H., Song, S.: semantic abstraction: open-world 3D scene understanding from 2D vision-language models. In: CoRL (2022)"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: CLIP2Point: transfer CLIP to point cloud classification with image-depth pre-training. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Izadi, S., et\u00a0al.: KinectFusion: real-time 3D reconstruction and interaction using a moving depth camera. In: UIST (2011)","DOI":"10.1145\/2047196.2047270"},{"key":"16_CR25","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Jiang, L., Zhao, H., Shi, S., Liu, S., Fu, C.W., Jia, J.: PointGroup: dual-set point grouping for 3D instance segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Kerr, J., Kim, C.M., Goldberg, K., Kanazawa, A., Tancik, M.: LERF: language embedded radiance fields. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"16_CR29","unstructured":"Kobayashi, S., Matsumoto, E., Sitzmann, V.: Decomposing nerf for editing via feature field distillation. In: NeurIPS (2022)"},{"key":"16_CR30","unstructured":"Lemke, O., Bauer, Z., Zurbr\u00fcgg, R., Pollefeys, M., Engelmann, F., Blum, H.: Spot-compose: a framework for open-vocabulary object retrieval and drawer manipulation in point clouds. In: 2nd Workshop on Mobile Manipulation and Embodied Intelligence at ICRA 2024 (2024)"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Liang, Z., Li, Z., Xu, S., Tan, M., Jia, K.: Instance segmentation in 3D scenes using semantic superpoint tree networks. In: CVPR (2021)","DOI":"10.1109\/ICCV48922.2021.00278"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Lu, J., Deng, J., Wang, C., He, J., Zhang, T.: Query refinement transformer for 3D instance segmentation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01697"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Lu, Y., et al.: Open-vocabulary point-cloud object detection without 3D annotation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00121"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Milletari, F., Navab, N., Ahmadi, S.A.: V-net: fully convolutional neural networks for volumetric medical image segmentation. In: 3DV (2016)","DOI":"10.1109\/3DV.2016.79"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Nekrasov, A., Schult, J., Litany, O., Leibe, B., Engelmann, F.: Mix3D: Out-of-context data augmentation for 3D scenes. In: 3DV (2021)","DOI":"10.1109\/3DV53792.2021.00022"},{"key":"16_CR36","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., Lischinski, D.: StyleCLIP: text-driven manipulation of StyleGAN imagery. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"Peng, S., Genova, K., Jiang, C., Tagliasacchi, A., Pollefeys, M., Funkhouser, T., et\u00a0al.: OpenScene: 3D scene understanding with open vocabularies. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"16_CR39","unstructured":"Qi, C.R., Su, H., Mo, K., Guibas, L.J.: PointNet: deep learning on point sets for 3D classification and segmentation. In: CVPR (2017)"},{"key":"16_CR40","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. In: NeurIPS (2017)"},{"key":"16_CR41","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: DenseCLIP: language-guided dense prediction with context-aware prompting. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"16_CR43","first-page":"545","volume":"37","author":"X Roynard","year":"2018","unstructured":"Roynard, X., Deschaud, J.E., Goulette, F.: Paris-Lille-3D: a large and high-quality ground-truth urban point cloud dataset for automatic segmentation and classification. IJRR 37, 545\u2013557 (2018)","journal-title":"IJRR"},{"key":"16_CR44","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1007\/978-3-031-19827-4_8","volume-title":"ECCV 2022","author":"D Rozenberszki","year":"2022","unstructured":"Rozenberszki, D., Litany, O., Dai, A.: Language-grounded indoor 3D semantic segmentation in the wild. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, pp. 125\u2013141. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_8"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Rozenberszki, D., Litany, O., Dai, A.: UnScene3D: unsupervised 3D instance segmentation for indoor scenes. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01886"},{"key":"16_CR46","doi-asserted-by":"crossref","unstructured":"Schult, J., Engelmann, F., Hermans, A., Litany, O., Tang, S., Leibe, B.: Mask3D: mask transformer for 3D semantic instance segmentation. In: ICRA (2023)","DOI":"10.1109\/ICRA48891.2023.10160590"},{"key":"16_CR47","doi-asserted-by":"crossref","unstructured":"Schult, J., Engelmann, F., Kontogianni, T., Leibe, B.: DualConvMesh-net: joint geodesic and euclidean convolutions on 3D meshes. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00864"},{"key":"16_CR48","unstructured":"Straub, J., et\u00a0al.: The replica dataset: a digital replica of indoor spaces. arXiv preprint arXiv:1906.05797 (2019)"},{"key":"16_CR49","doi-asserted-by":"crossref","unstructured":"Sun, J., Qing, C., Tan, J., Xu, X.: Superpoint transformer for 3D scene instance segmentation. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i2.25335"},{"key":"16_CR50","unstructured":"Sun, T., et al.: Nothing stands still: a spatiotemporal benchmark on 3d point cloud registration under large geometric and temporal change. arXiv preprint arXiv:2311.09346 (2023)"},{"key":"16_CR51","unstructured":"Takmaz, A., Fedele, E., Sumner, R.W., Pollefeys, M., Tombari, F., Engelmann, F.: OpenMask3D: open-vocabulary 3D instance segmentation. In: NeurIPS (2023)"},{"key":"16_CR52","doi-asserted-by":"crossref","unstructured":"Takmaz, A., et al.: 3D segmentation of humans in point clouds with synthetic data. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00125"},{"key":"16_CR53","doi-asserted-by":"crossref","unstructured":"Thomas, H., Qi, C.R., Deschaud, J.E., Marcotegui, B., Goulette, F., Guibas, L.J.: KPConv: flexible and deformable convolution for point clouds. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00651"},{"key":"16_CR54","doi-asserted-by":"crossref","unstructured":"Vu, T., Kim, K., Luu, T.M., Nguyen, T., Yoo, C.D.: SoftGroup for 3D instance segmentation on point clouds. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"16_CR55","doi-asserted-by":"crossref","unstructured":"Weder, S., Blum, H., Engelmann, F., Pollefeys, M.: LabelMaker: automatic semantic label generation from RGB-D trajectories. In: 3DV (2024)","DOI":"10.1109\/3DV62453.2024.00075"},{"key":"16_CR56","unstructured":"Yang, B., et al.: Learning object bounding boxes for 3D instance segmentation on point clouds. In: NeurIPS (2019)"},{"key":"16_CR57","unstructured":"Yang, Y., Wu, X., He, T., Zhao, H., Liu, X.: SAM3D: segment anything in 3D scenes. In: ICCVW (2023)"},{"key":"16_CR58","doi-asserted-by":"crossref","unstructured":"Yeshwanth, C., Liu, Y.C., Nie\u00dfner, M., Dai, A.: ScanNet++: a high-fidelity dataset of 3D indoor scenes. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00008"},{"key":"16_CR59","doi-asserted-by":"crossref","unstructured":"Yi, L., Zhao, W., Wang, H., Sung, M., Guibas, L.J.: GSPN: generative shape proposal network for 3D instance segmentation in point cloud. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00407"},{"key":"16_CR60","doi-asserted-by":"crossref","unstructured":"Yue, Y., Das, A., Engelmann, F., Tang, S., Lenssen, J.: Improving 2D feature representations by 3D-aware fine-tuning. In: ECCV (2024)","DOI":"10.1007\/978-3-031-72627-9_4"},{"key":"16_CR61","doi-asserted-by":"crossref","unstructured":"Yue, Y., Kontogianni, T., Schindler, K., Engelmann, F.: Connecting the dots: floorplan reconstruction using two-level queries. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00088"},{"key":"16_CR62","doi-asserted-by":"crossref","unstructured":"Zeng, Y., et al.: CLIP2: contrastive language-image-point pretraining from real-world point cloud data. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01463"},{"key":"16_CR63","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: PointCLIP: point cloud understanding by CLIP. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"16_CR64","doi-asserted-by":"crossref","unstructured":"Zurbr\u00fcgg, R., et al.: ICGNet: a unified approach for instance-centric grasping. In: ICRA (2024)","DOI":"10.1109\/ICRA57147.2024.10611725"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72754-2_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:06:40Z","timestamp":1730300800000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72754-2_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031727535","9783031727542"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72754-2_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}