{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T17:04:18Z","timestamp":1771952658746,"version":"3.50.1"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726262","type":"print"},{"value":"9783031726279","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72627-9_4","type":"book-chapter","created":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T21:02:10Z","timestamp":1729371730000},"page":"57-74","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Improving 2D Feature Representations by\u00a03D-Aware Fine-Tuning"],"prefix":"10.1007","author":[{"given":"Yuanwen","family":"Yue","sequence":"first","affiliation":[]},{"given":"Anurag","family":"Das","sequence":"additional","affiliation":[]},{"given":"Francis","family":"Engelmann","sequence":"additional","affiliation":[]},{"given":"Siyu","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Jan Eric","family":"Lenssen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,20]]},"reference":[{"key":"4_CR1","unstructured":"Amir, S., Gandelsman, Y., Bagon, S., Dekel, T.: Deep ViT features as dense visual descriptors. In: European Conference on Computer Vision (ECCV) Workshops (2022)"},{"key":"4_CR2","doi-asserted-by":"publisher","unstructured":"Bachmann, R., Mizrahi, D., Atanov, A., Zamir, A.: MultiMAE: multi-modal multi-task masked autoencoders. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXVII, pp. 348\u2013367. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_20","DOI":"10.1007\/978-3-031-19836-6_20"},{"key":"4_CR3","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of Image Transformers. In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Courville, A., Vincent, P.: Representation learning: a review and new perspectives. IEEE Trans. Pattern Anal. Mach. Intell. 35(8), 1798\u20131828 (2013)","DOI":"10.1109\/TPAMI.2013.50"},{"key":"4_CR5","unstructured":"Bhat, S.F., Alhashim, I., Wonka, P.: Adabins: depth estimation using adaptive bins. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2021)"},{"key":"4_CR6","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. In: International Conference on Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"4_CR8","unstructured":"Chen, M., et al.: Generative pretraining from pixels. In: International Conference on Machine Learning (ICML) (2020)"},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"4_CR10","unstructured":"Darcet, T., Oquab, M., Mairal, J., Bojanowski, P.: Vision transformers need registers. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"4_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2018)"},{"key":"4_CR12","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"4_CR13","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"4_CR14","unstructured":"Dosovitskiy, A., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with convolutional neural networks. In: International Conference on Neural Information Processing Systems (NeurIPS) (2014)"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"El\u00a0Banani, M., et al.: Probing the 3D awareness of visual foundation models. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02059"},{"key":"4_CR16","unstructured":"Engelmann, F., Manhardt, F., Niemeyer, M., Tateno, K., Tombari, F.: OpenNeRF: open set 3D neural scene segmentation with pixel-wise features and rendered novel views. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Everingham, M., Eslami, S.A., Van\u00a0Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The Pascal visual object classes challenge: a retrospective. Int. J. Comput. Vision (2015)","DOI":"10.1007\/s11263-014-0733-5"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: the Kitti dataset. Int. J. Robot. Res. (2013)","DOI":"10.1177\/0278364913491297"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"4_CR20","unstructured":"Gidaris, S., Singh, P., Komodakis, N.: Unsupervised representation learning by predicting image rotations. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"4_CR21","unstructured":"Grill, J.B., et\u00a0al.: Bootstrap your own latent-a new approach to self-supervised learning. In: International Conference on Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"4_CR22","unstructured":"Ha, H., Song, S.: Semantic abstraction: open-world 3D scene understanding from 2D vision-language models. In: Conference on Robot Learning (CoRL) (2022)"},{"key":"4_CR23","unstructured":"Hadsell, R., Chopra, S., LeCun, Y.: Dimensionality reduction by learning an invariant mapping. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2006)"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Hou, J., Dai, X., He, Z., Dai, A., Nie\u00dfner, M.: Mask3D: pre-training 2D vision transformers by learning masked 3D priors. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01298"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Hou, J., Xie, S., Graham, B., Dai, A., Nie\u00dfner, M.: Pri3D: can 3D priors help 2D representation learning? In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00564"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"Huang, R., et al.: Segment3D: learning fine-grained class-agnostic 3D segmentation without manual labels. In: European Conference on Computer Vision (ECCV) (2024)","DOI":"10.1007\/978-3-031-72754-2_16"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Jatavallabhula, K.M., et al.: ConceptFusion: open-set multimodal 3D mapping. Sci. Syst. (RSS) Robot. (2023)","DOI":"10.15607\/RSS.2023.XIX.066"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Ke, B., Obukhov, A., Huang, S., Metzger, N., Daudt, R.C., Schindler, K.: Repurposing diffusion-based image generators for monocular depth estimation. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"4_CR31","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3D Gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. (2023)","DOI":"10.1145\/3592433"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Kerr, J., Kim, C.M., Goldberg, K., Kanazawa, A., Tancik, M.: LERF: language embedded radiance fields. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"4_CR34","unstructured":"Kobayashi, S., Matsumoto, E., Sitzmann, V.: Decomposing Nerf for editing via feature field distillation. In: International Conference on Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"4_CR35","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven Semantic Segmentation. In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Li, F., et al.: Mask DINO: towards a unified transformer-based framework for object detection and segmentation. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"4_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"4_CR38","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (ICLR) (2019)"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Mazur, K., Sucar, E., Davison, A.J.: Feature-realistic neural fusion for real-time, open set scene understanding. In: International Conference on Robotics and Automation (ICRA) (2023)","DOI":"10.1109\/ICRA48891.2023.10160800"},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. In: European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving Jigsaw puzzles. In: European Conference on Computer Vision (ECCV) (2016)","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"4_CR42","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. Trans. Mach. Learn. Res. (2023)"},{"key":"4_CR43","doi-asserted-by":"crossref","unstructured":"Pathak, D., Girshick, R., Doll\u00e1r, P., Darrell, T., Hariharan, B.: Learning features by watching objects move. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.638"},{"key":"4_CR44","doi-asserted-by":"crossref","unstructured":"Peng, S., Genova, K., Jiang, C., Tagliasacchi, A., Pollefeys, M., Funkhouser, T.: OpenScene: 3D scene understanding with open vocabularies. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"4_CR45","doi-asserted-by":"crossref","unstructured":"Qin, M., Li, W., Zhou, J., Wang, H., Pfister, H.: LangSplat: 3D language Gaussian splatting. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01895"},{"key":"4_CR46","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML) (2021)"},{"key":"4_CR47","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR48","unstructured":"Saxena, S., et al.: The surprising effectiveness of diffusion models for optical flow and monocular depth estimation. In: International Conference on Neural Information Processing Systems (NeurIPS) (2024)"},{"key":"4_CR49","unstructured":"Shen, W., Yang, G., Yu, A., Wong, J., Kaelbling, L.P., Isola, P.: Distilled feature fields enable few-shot language-guided manipulation. In: Conference on Robot Learning (CoRL) (2023)"},{"key":"4_CR50","doi-asserted-by":"crossref","unstructured":"Shi, J.C., Wang, M., Duan, H.B., Guan, S.H.: Language embedded 3D Gaussians for open-vocabulary scene understanding. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00510"},{"key":"4_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1007\/978-3-642-33715-4_54","volume-title":"Computer Vision \u2013 ECCV 2012","author":"N Silberman","year":"2012","unstructured":"Silberman, N., Hoiem, D., Kohli, P., Fergus, R.: Indoor segmentation and support inference from RGBD images. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7576, pp. 746\u2013760. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33715-4_54"},{"key":"4_CR52","unstructured":"Takmaz, A., Fedele, E., Sumner, R.W., Pollefeys, M., Tombari, F., Engelmann, F.: OpenMask3D: open-vocabulary 3D instance segmentation. In: International Conference on Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"4_CR53","unstructured":"Tan, H., Wu, S., Pi, J.: Semantic diffusion network for semantic segmentation. In: International Conference on Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"4_CR54","doi-asserted-by":"publisher","unstructured":"Touvron, H., Cord, M., J\u00e9gou, H.: DeiT III: revenge of\u00a0the\u00a0ViT. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXIV, pp. 516\u2013533. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_30","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"4_CR55","doi-asserted-by":"crossref","unstructured":"Tschernezki, V., Laina, I., Larlus, D., Vedaldi, A.: Neural feature fusion fields: 3D distillation of self-supervised 2D image representations. In: International Conference on 3D Vision (3DV) (2022)","DOI":"10.1109\/3DV57658.2022.00056"},{"key":"4_CR56","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"4_CR57","doi-asserted-by":"crossref","unstructured":"Weder, S., Blum, H., Engelmann, F., Pollefeys, M.: LabelMaker: automatic semantic label generation from RGB-D trajectories. In: International Conference on 3D Vision (3DV) (2024)","DOI":"10.1109\/3DV62453.2024.00075"},{"key":"4_CR58","unstructured":"Weinzaepfel, P., et al.: CroCo: self-supervised pre-training for 3D vision tasks by cross-view completion. In: International Conference on Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"4_CR59","doi-asserted-by":"crossref","unstructured":"Yang, L., Kang, B., Huang, Z., Xu, X., Feng, J., Zhao, H.: Depth anything: unleashing the power of large-scale unlabeled data. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"4_CR60","doi-asserted-by":"crossref","unstructured":"Yeshwanth, C., Liu, Y.C., Nie\u00dfner, M., Dai, A.: ScanNet++: a high-fidelity dataset of 3D indoor scenes. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00008"},{"key":"4_CR61","unstructured":"Zhang, J., et al.: A tale of two features: stable diffusion complements DINO for zero-shot semantic correspondence. In: International Conference on Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"4_CR62","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20K dataset. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"4_CR63","doi-asserted-by":"crossref","unstructured":"Zhou, S., et al.: Feature 3DGS: supercharging 3D Gaussian splatting to enable distilled feature fields. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02048"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72627-9_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T22:45:48Z","timestamp":1732920348000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72627-9_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,20]]},"ISBN":["9783031726262","9783031726279"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72627-9_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,20]]},"assertion":[{"value":"20 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}