{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:51:19Z","timestamp":1777654279294,"version":"3.51.4"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729485","type":"print"},{"value":"9783031729492","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72949-2_2","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:22:17Z","timestamp":1730301737000},"page":"19-38","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["CONDENSE: Consistent 2D\/3D Pre-training for\u00a0Dense and\u00a0Sparse Features from\u00a0Multi-View Images"],"prefix":"10.1007","author":[{"given":"Xiaoshuai","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Zhicheng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Howard","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Soham","family":"Ghosh","sequence":"additional","affiliation":[]},{"given":"Danushen","family":"Gnanapragasam","sequence":"additional","affiliation":[]},{"given":"Varun","family":"Jampani","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Su","sequence":"additional","affiliation":[]},{"given":"Leonidas","family":"Guibas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"2_CR1","unstructured":"An, X., et al.: Unicom: universal and compact representation learning for image retrieval. arXiv preprint arXiv:2304.05884 (2023)"},{"key":"2_CR2","doi-asserted-by":"publisher","unstructured":"Armeni, I., Sener, O., Zamir, A.R., Jiang, H., Brilakis, I., Fischer, M., Savarese, S.: 3d semantic parsing of large-scale indoor spaces. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1534\u20131543 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.170","DOI":"10.1109\/CVPR.2016.170"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Barron, J.T., Mildenhall, B., Verbin, D., Srinivasan, P.P., Hedman, P.: Mip-nerf 360: unbounded anti-aliased neural radiance fields. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00539"},{"issue":"1","key":"2_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1899404.1899405","volume":"30","author":"AM Bronstein","year":"2011","unstructured":"Bronstein, A.M., Bronstein, M.M., Guibas, L.J., Ovsjanikov, M.: Shape google: geometric words and expressions for invariant shape retrieval. ACM Trans. Graph. (TOG) 30(1), 1\u201320 (2011)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Carlucci, F.M., D\u2019Innocente, A., Bucci, S., Caputo, B., Tommasi, T.: Domain generalization by solving jigsaw puzzles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2229\u20132238 (2019)","DOI":"10.1109\/CVPR.2019.00233"},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"2_CR7","unstructured":"Chang, A.X., et al.: ShapeNet: an information-rich 3D model repository. Technical Report. arXiv:1512.03012 (2015)"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Chen, A., et al.: Mvsnerf: fast generalizable radiance field reconstruction from multi-view stereo. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14124\u201314133 (2021)","DOI":"10.1109\/ICCV48922.2021.01386"},{"key":"2_CR9","doi-asserted-by":"publisher","unstructured":"Chen, D.Y., Tian, X.P., Shen, Y.T., Ouhyoung, M.: On visual similarity based 3d model retrieval. Comput. Graph. Forum 22(3), 223\u2013232 (2003). https:\/\/doi.org\/10.1111\/1467-8659.00669. https:\/\/onlinelibrary.wiley.com\/doi\/abs\/10.1111\/1467-8659.00669","DOI":"10.1111\/1467-8659.00669"},{"key":"2_CR10","unstructured":"Chen, G., Wang, M., Yang, Y., Yu, K., Yuan, L., Yue, Y.: Pointgpt: auto-regressively generative pre-training from point clouds. arXiv preprint arXiv:2305.11487 (2023)"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Chen, R., Han, S., Xu, J., Su, H.: Point-based multi-view stereo network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1538\u20131547 (2019)","DOI":"10.1109\/ICCV.2019.00162"},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Choy, C., Gwak, J., Savarese, S.: 4d spatio-temporal convnets: minkowski convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3075\u20133084 (2019)","DOI":"10.1109\/CVPR.2019.00319"},{"key":"2_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1007\/978-3-319-46723-8_49","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2016","author":"\u00d6 \u00c7i\u00e7ek","year":"2016","unstructured":"\u00c7i\u00e7ek, \u00d6., Abdulkadir, A., Lienkamp, S.S., Brox, T., Ronneberger, O.: 3D U-Net: learning dense volumetric segmentation from sparse annotation. In: Ourselin, S., Joskowicz, L., Sabuncu, M.R., Unal, G., Wells, W. (eds.) MICCAI 2016. LNCS, vol. 9901, pp. 424\u2013432. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46723-8_49"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: Scannet: Richly-annotated 3d reconstructions of indoor scenes. In: Proceedings of Computer Vision and Pattern Recognition (CVPR). IEEE (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Deng, X., Zhang, W., Ding, Q., Zhang, X.: Pointvector: a vector representation in point cloud analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9455\u20139465 (2023)","DOI":"10.1109\/CVPR52729.2023.00912"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"DeTone, D., Malisiewicz, T., Rabinovich, A.: Superpoint: self-supervised interest point detection and description (2018)","DOI":"10.1109\/CVPRW.2018.00060"},{"key":"2_CR17","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Hamdi, A., Giancola, S., Ghanem, B.: Mvtn: multi-view transformation network for 3d shape recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1\u201311 (2021)","DOI":"10.1109\/ICCV48922.2021.00007"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2_CR22","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"2_CR23","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Jensen, R., Dahl, A., Vogiatzis, G., Tola, E., Aan\u00e6s, H.: Large scale multi-view stereopsis evaluation. In: 2014 IEEE Conference on Computer Vision and Pattern Recognition, pp. 406\u2013413. IEEE (2014)","DOI":"10.1109\/CVPR.2014.59"},{"key":"2_CR25","doi-asserted-by":"crossref","unstructured":"Kerr, J., Kim, C.M., Goldberg, K., Kanazawa, A., Tancik, M.: Lerf: language embedded radiance fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19729\u201319739 (2023)","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"2_CR26","unstructured":"Li, Y., Bu, R., Sun, M., Wu, W., Di, X., Chen, B.: Pointcnn: convolution on x-transformed points. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"issue":"6","key":"2_CR27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2816795.2818071","volume":"34","author":"Y Li","year":"2015","unstructured":"Li, Y., Su, H., Qi, C.R., Fish, N., Cohen-Or, D., Guibas, L.J.: Joint embeddings of shapes and images via cnn image purification. ACM Trans. Graph. (TOG) 34(6), 1\u201312 (2015)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"2_CR28","unstructured":"Lin, H., et al.: Meta architecure for point cloud analysis. arXiv:2211.14462 (2022)"},{"key":"2_CR29","unstructured":"Liu, M., et al.: OpenShape: scaling up 3D shape representation towards open-world understanding. In: Annual Conference on Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"2_CR30","doi-asserted-by":"crossref","unstructured":"Liu, M., et al.: Partslip: low-shot part segmentation for 3d point clouds via pretrained image-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21736\u201321746 (2023)","DOI":"10.1109\/CVPR52729.2023.02082"},{"key":"2_CR31","unstructured":"Ma, X., Qin, C., You, H., Ran, H., Fu, Y.: Rethinking network design and local geometry in point cloud: a simple residual mlp framework. arXiv preprint arXiv:2202.07123 (2022)"},{"issue":"1","key":"2_CR32","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"2_CR33","doi-asserted-by":"crossref","unstructured":"Nekrasov, A., Schult, J., Litany, O., Leibe, B., Engelmann, F.: Mix3d: out-of-context data augmentation for 3d scenes. In: 2021 International Conference on 3D Vision (3DV), pp. 116\u2013125. IEEE (2021)","DOI":"10.1109\/3DV53792.2021.00022"},{"key":"2_CR34","unstructured":"Oquab, M., et\u00a0al.: Dinov2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"2_CR35","doi-asserted-by":"crossref","unstructured":"Osada, R., Funkhouser, T., Chazelle, B., Dobkin, D.: Matching 3d models with shape distributions. In: Proceedings International Conference on Shape Modeling and Applications, pp. 154\u2013166. IEEE (2001)","DOI":"10.1109\/SMA.2001.923386"},{"key":"2_CR36","doi-asserted-by":"publisher","unstructured":"Pang, Y., Wang, W., Tay, F.E., Liu, W., Tian, Y., Yuan, L.: Masked autoencoders for point cloud self-supervised learning. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part II, pp. 604\u2013621. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20086-1_35","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"2_CR37","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2536\u20132544 (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"2_CR38","doi-asserted-by":"crossref","unstructured":"Peng, S., et\u00a0al.: Openscene: 3d scene understanding with open vocabularies. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 815\u2013824 (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"2_CR39","unstructured":"Qi, C.R., Su, H., Mo, K., Guibas, L.J.: Pointnet: deep learning on point sets for 3d classification and segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 652\u2013660 (2017)"},{"key":"2_CR40","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"2_CR41","unstructured":"Qi, Z., et al.: Contrast with reconstruct: contrastive 3d representation learning guided by generative pretraining. arXiv preprint arXiv:2302.02318 (2023)"},{"key":"2_CR42","first-page":"23192","volume":"35","author":"G Qian","year":"2022","unstructured":"Qian, G., et al.: Pointnext: revisiting pointnet++ with improved training and scaling strategies. Adv. Neural. Inf. Process. Syst. 35, 23192\u201323204 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR43","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"2_CR44","doi-asserted-by":"crossref","unstructured":"Ran, H., Liu, J., Wang, C.: Surface representation for point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18942\u201318952 (2022)","DOI":"10.1109\/CVPR52688.2022.01837"},{"key":"2_CR45","unstructured":"Ridnik, T., Ben-Baruch, E., Noy, A., Zelnik-Manor, L.: Imagenet-21k pretraining for the masses. arXiv preprint arXiv:2104.10972 (2021)"},{"issue":"3","key":"2_CR46","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vision (IJCV) 115(3), 211\u2013252 (2015). https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int. J. Comput. Vision (IJCV)"},{"key":"2_CR47","unstructured":"Straub, J., et al.: The replica dataset: a digital replica of indoor spaces. arXiv preprint arXiv:1906.05797 (2019)"},{"key":"2_CR48","doi-asserted-by":"crossref","unstructured":"Thomas, H., Qi, C.R., Deschaud, J.E., Marcotegui, B., Goulette, F., Guibas, L.J.: Kpconv: flexible and deformable convolution for point clouds. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6411\u20136420 (2019)","DOI":"10.1109\/ICCV.2019.00651"},{"key":"2_CR49","doi-asserted-by":"crossref","unstructured":"Uy, M.A., Pham, Q.H., Hua, B.S., Nguyen, D.T., Yeung, S.K.: Revisiting point cloud classification: a new benchmark dataset and classification model on real-world data. In: International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00167"},{"key":"2_CR50","unstructured":"Vora, S., et al.: Nesf: neural semantic fields for generalizable semantic segmentation of 3d scenes. arXiv preprint arXiv:2111.13260 (2021)"},{"key":"2_CR51","doi-asserted-by":"crossref","unstructured":"Vu, T., Kim, K., Luu, T.M., Nguyen, T., Yoo, C.D.: Softgroup for 3d instance segmentation on point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2708\u20132717 (2022)","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"2_CR52","first-page":"29975","volume":"35","author":"H Wang","year":"2022","unstructured":"Wang, H., et al.: Cagroup3d: class-aware grouping for 3d object detection on point clouds. Adv. Neural. Inf. Process. Syst. 35, 29975\u201329988 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR53","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1016\/j.neucom.2018.09.008","volume":"321","author":"B Wu","year":"2018","unstructured":"Wu, B., Liu, Y., Lang, B., Huang, L.: Dgcnn: disordered graph convolutional neural network based on the gaussian mixture model. Neurocomputing 321, 346\u2013356 (2018)","journal-title":"Neurocomputing"},{"key":"2_CR54","doi-asserted-by":"crossref","unstructured":"Wu, X., et al.: Towards large-scale 3d representation learning with multi-dataset point prompt training. arXiv preprint arXiv:2308.09718 (2023)","DOI":"10.1109\/CVPR52733.2024.01849"},{"key":"2_CR55","doi-asserted-by":"crossref","unstructured":"Wu, X., Wen, X., Liu, X., Zhao, H.: Masked scene contrast: a scalable framework for unsupervised 3d representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9415\u20139424 (2023)","DOI":"10.1109\/CVPR52729.2023.00908"},{"key":"2_CR56","unstructured":"Wu, Z., et al.: 3d shapenets: a deep representation for volumetric shapes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1912\u20131920 (2015)"},{"key":"2_CR57","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"574","DOI":"10.1007\/978-3-030-58580-8_34","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Xie","year":"2020","unstructured":"Xie, S., Gu, J., Guo, D., Qi, C.R., Guibas, L., Litany, O.: PointContrast: unsupervised pre-training for 3D point cloud understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 574\u2013591. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_34"},{"key":"2_CR58","doi-asserted-by":"crossref","unstructured":"Xu, C., et\u00a0al.: Nerf-det: learning geometry-aware volumetric representation for multi-view 3d object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 23320\u201323330 (2023)","DOI":"10.1109\/ICCV51070.2023.02131"},{"key":"2_CR59","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: Ulip: learning a unified representation of language, images, and point clouds for 3d understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1179\u20131189 (2023)","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"2_CR60","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: Ulip-2: towards scalable multimodal pre-training for 3d understanding. arXiv preprint arXiv:2305.08275 (2023)","DOI":"10.1109\/CVPR52733.2024.02558"},{"key":"2_CR61","unstructured":"Yang, Y.Q., et al.: Swin3d: a pretrained transformer backbone for 3d indoor scene understanding. arXiv preprint arXiv:2304.06906 (2023)"},{"key":"2_CR62","doi-asserted-by":"crossref","unstructured":"Yao, Y., Luo, Z., Li, S., Fang, T., Quan, L.: Mvsnet: depth inference for unstructured multi-view stereo. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 767\u2013783 (2018)","DOI":"10.1007\/978-3-030-01237-3_47"},{"key":"2_CR63","doi-asserted-by":"crossref","unstructured":"Ye, J., Wang, N., Wang, X.: Featurenerf: learning generalizable nerfs by distilling foundation models. arXiv preprint arXiv:2303.12786 (2023)","DOI":"10.1109\/ICCV51070.2023.00823"},{"key":"2_CR64","doi-asserted-by":"crossref","unstructured":"Yu, A., Ye, V., Tancik, M., Kanazawa, A.: pixelnerf: neural radiance fields from one or few images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4578\u20134587 (2021)","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"2_CR65","doi-asserted-by":"crossref","unstructured":"Yu, X., et al.: Mvimgnet: a large-scale dataset of multi-view images. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00883"},{"key":"2_CR66","doi-asserted-by":"crossref","unstructured":"Yu, X., Tang, L., Rao, Y., Huang, T., Zhou, J., Lu, J.: Point-bert: pre-training 3d point cloud transformers with masked point modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19313\u201319322 (2022)","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"2_CR67","doi-asserted-by":"crossref","unstructured":"Zeid, K.A., Schult, J., Hermans, A., Leibe, B.: Point2vec for self-supervised representation learning on point clouds. arXiv preprint arXiv:2303.16570 (2023)","DOI":"10.1007\/978-3-031-54605-1_9"},{"key":"2_CR68","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Pointclip: point cloud understanding by clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8552\u20138562 (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"2_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"2_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, X., Bi, S., Sunkavalli, K., Su, H., Xu, Z.: Nerfusion: fusing radiance fields for large-scale scene reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5449\u20135458 (2022)","DOI":"10.1109\/CVPR52688.2022.00537"},{"key":"2_CR71","doi-asserted-by":"crossref","unstructured":"Zhang, X., Kundu, A., Funkhouser, T., Guibas, L., Su, H., Genova, K.: Nerflets: local radiance fields for efficient structure-aware 3d scene representation from 2d supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8274\u20138284 (2023)","DOI":"10.1109\/CVPR52729.2023.00800"},{"key":"2_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Girdhar, R., Joulin, A., Misra, I.: Self-supervised pretraining of 3d features on any point-cloud. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10252\u201310263 (2021)","DOI":"10.1109\/ICCV48922.2021.01009"},{"key":"2_CR73","doi-asserted-by":"crossref","unstructured":"Zhi, S., Laidlow, T., Leutenegger, S., Davison, A.J.: In-place scene labelling and understanding with implicit scene representation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01554"},{"key":"2_CR74","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., Oliva, A.: Learning deep features for scene recognition using places database. In: Ghahramani, Z., Welling, M., Cortes, C., Lawrence, N., Weinberger, K.Q. (eds.) Advances in Neural Information Processing Systems, vol.\u00a027. Curran Associates, Inc. (2014). https:\/\/proceedings.neurips.cc\/paper\/2014\/file\/3fe94a002317b5f9259f82690aeea4cd-Paper.pdf"},{"key":"2_CR75","unstructured":"Zhou, J., et al.: ibot: image bert pre-training with online tokenizer. arXiv preprint arXiv:2111.07832 (2021)"},{"key":"2_CR76","unstructured":"Zhou, J., Wang, J., Ma, B., Liu, Y.S., Huang, T., Wang, X.: Uni3d: exploring unified 3d representation at scale. arXiv preprint arXiv:2310.06773 (2023)"},{"key":"2_CR77","doi-asserted-by":"crossref","unstructured":"Zhou, T., Tucker, R., Flynn, J., Fyffe, G., Snavely, N.: Stereo magnification: learning view synthesis using multiplane images. ACM Trans. Graph. (Proc. SIGGRAPH) 37 (2018). https:\/\/arxiv.org\/abs\/1805.09817","DOI":"10.1145\/3197517.3201323"},{"key":"2_CR78","unstructured":"Zhu, H., et\u00a0al.: Ponderv2: pave the way for 3d foundataion model with a universal pre-training paradigm. arXiv preprint arXiv:2310.08586 (2023)"},{"key":"2_CR79","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: Pointclip v2: prompting clip and gpt for powerful 3d open-world learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2639\u20132650 (2023)","DOI":"10.1109\/ICCV51070.2023.00249"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72949-2_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:38:28Z","timestamp":1730302708000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72949-2_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031729485","9783031729492"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72949-2_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}