{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:15:39Z","timestamp":1777655739169,"version":"3.51.4"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031734137","type":"print"},{"value":"9783031734144","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73414-4_21","type":"book-chapter","created":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T17:02:54Z","timestamp":1729789374000},"page":"364-381","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Multi-modal Relation Distillation for\u00a0Unified 3D Representation Learning"],"prefix":"10.1007","author":[{"given":"Huiqun","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiping","family":"Bao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Panwang","family":"Pan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zeming","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruijie","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Di","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,25]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Andonian, A., Chen, S., Hamid, R.: Robust cross-modal representation learning with progressive self-distillation. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 16430\u201316441 (2022)","DOI":"10.1109\/CVPR52688.2022.01594"},{"key":"21_CR2","doi-asserted-by":"crossref","unstructured":"Armeni, I., et al.: 3d semantic parsing of large-scale indoor spaces. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1534\u20131543 (2016)","DOI":"10.1109\/CVPR.2016.170"},{"key":"21_CR3","unstructured":"Cadena, C., Dick, A.R., Reid, I.D.: Multi-modal auto-encoders as joint estimators for robotics scene understanding. In: Robotics: Science and Systems, vol.\u00a05 (2016)"},{"key":"21_CR4","unstructured":"Chang, A.X., et al.: Shapenet: an information-rich 3d model repository. arXiv preprint arXiv:1512.03012 (2015)"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Chen, A., et al.: Pimae: point cloud and image interactive masked autoencoders for 3d object detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 5291\u20135301 (2023)","DOI":"10.1109\/CVPR52729.2023.00512"},{"key":"21_CR6","unstructured":"Chen, G., Wang, M., Yang, Y., Yu, K., Yuan, L., Yue, Y.: Pointgpt: auto-regressively generative pre-training from point clouds. Adv. Neural Inform. Process. Syst. (2023)"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Collins, J., et al.: ABO: dataset and benchmarks for real-world 3d object understanding. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 21094\u201321104 (2022)","DOI":"10.1109\/CVPR52688.2022.02045"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Deitke, M., et al.: Objaverse: a universe of annotated 3d objects. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 13142\u201313153 (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"21_CR9","unstructured":"Dong, R., et al.: Autoencoders as cross-modal teachers: can pretrained 2d image transformers help 3d representation learning? In: International Conference on Learning Representation (2023)"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Fu, H., et al.: 3d-future: 3d furniture shape with texture. Int. J. Comput. Vis. 3313\u20133337 (2021)","DOI":"10.1007\/s11263-021-01534-z"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"Gao, Y., et al.: Softclip: softer cross-modal alignment makes clip stronger. In: AAAI, pp. 1860\u20131868 (2024)","DOI":"10.1609\/aaai.v38i3.27955"},{"key":"21_CR12","unstructured":"Gao, Y., et al.: Pyramidclip: hierarchical feature alignment for vision-language model pretraining. Adv. Neural Inform. Process. Syst. 35, 35959\u201335970 (2022)"},{"key":"21_CR13","first-page":"6704","volume":"35","author":"S Goel","year":"2022","unstructured":"Goel, S., Bansal, H., Bhatia, S., Rossi, R., Vinay, V., Grover, A.: Cyclip: cyclic contrastive language-image pretraining. Adv. Neural Inform. Process. Syst. 35, 6704\u20136719 (2022)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Guo, Z., Li, X., Heng, P.A.: Joint-mae: 2d-3d joint masked autoencoders for 3d point cloud pre-training. In: IJCAI, pp. 791\u2013799 (2023)","DOI":"10.24963\/ijcai.2023\/88"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Hegde, D., Valanarasu, J.M.J., Patel, V.: Clip goes 3d: leveraging prompt tuning for language grounded 3d recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2028\u20132038 (2023)","DOI":"10.1109\/ICCVW60793.2023.00217"},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Hoffmann, D.T., Behrmann, N., Gall, J., Brox, T., Noroozi, M.: Ranking info noise contrastive estimation: boosting contrastive learning via ranked positives. In: AAAI, vol.\u00a036, pp. 897\u2013905 (2022)","DOI":"10.1609\/aaai.v36i1.19972"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Huang, S., Xie, Y., Zhu, S.C., Zhu, Y.: Spatio-temporal self-supervised representation learning for 3d point clouds. In: International Conference on Computer Vision, pp. 6535\u20136545 (2021)","DOI":"10.1109\/ICCV48922.2021.00647"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: Clip2point: transfer clip to point cloud classification with image-depth pre-training. In: International Conference on Computer Vision, pp. 22157\u201322167 (2023)","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"21_CR20","unstructured":"Kim, B., Choi, S., Hwang, D., Lee, M., Lee, H.: Transferring pre-trained multimodal representations with cross-modal similarity matching. Adv. Neural Inform. Process. Syst. (2022)"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Li, H., Zhou, X., Tuan, L.A., Miao, C.: Rethinking negative pairs in code search. In: EMNLP, pp. 12760\u201312774 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.786"},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"Li, R., Li, X., Fu, C.W., Cohen-Or, D., Heng, P.A.: Pu-gan: a point cloud upsampling adversarial network. In: International Conference on Computer Vision, pp. 7203\u20137212 (2019)","DOI":"10.1109\/ICCV.2019.00730"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Li, Y., et\u00a0al.: Deepfusion: lidar-camera deep fusion for multi-modal 3d object detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 17182\u201317191 (2022)","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"21_CR24","unstructured":"Liang, V.W., Zhang, Y., Kwon, Y., Yeung, S., Zou, J.Y.: Mind the gap: understanding the modality gap in multi-modal contrastive representation learning. Adv. Neural Inform. Process. Syst. 35, 17612\u201317625 (2022)"},{"key":"21_CR25","unstructured":"Liu, H., Simonyan, K., Yang, Y.: DARTS: differentiable architecture search. In: International Conference on Learning Representation (2019)"},{"key":"21_CR26","unstructured":"Liu, M., et al.: Openshape: scaling up 3d shape representation towards open-world understanding. Adv. Neural Inform. Process. Syst. (2023)"},{"key":"21_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, Z., Cao, Y., Hu, H., Tong, X.: Group-free 3d object detection via transformers. In: International Conference on Computer Vision, pp. 2949\u20132958 (2021)","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"21_CR28","unstructured":"Luo, T., Rockwell, C., Lee, H., Johnson, J.: Scalable 3d captioning with pretrained models. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advance Neural Information Processing System (2023)"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Ma, W., Xu, M., Li, X., Zhou, X.: Multicad: contrastive representation learning for multi-modal 3d computer-aided design models. In: ACM International Conference on Information Knowledge Management (2023)","DOI":"10.1145\/3583780.3614982"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3d object detection. In: International Conference on Computer Vision, pp. 2906\u20132917 (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"21_CR31","doi-asserted-by":"publisher","unstructured":"Pang, Y., Wang, W., Tay, F.E.H., Liu, W., Tian, Y., Yuan, L.: Masked autoencoders for\u00a0point cloud self-supervised learning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part II, pp. 604\u2013621. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20086-1_35","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"21_CR32","doi-asserted-by":"crossref","unstructured":"Park, W., Kim, D., Lu, Y., Cho, M.: Relational knowledge distillation. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3967\u20133976 (2019)","DOI":"10.1109\/CVPR.2019.00409"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Poursaeed, O., Jiang, T., Qiao, H., Xu, N., Kim, V.G.: Self-supervised learning of point clouds via orientation estimation. In: 3DV, pp. 1018\u20131028 (2020)","DOI":"10.1109\/3DV50981.2020.00112"},{"key":"21_CR34","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. Adv. Neural Inform. Process. Syst. (2017)"},{"key":"21_CR35","unstructured":"Qi, Z., et al.: Contrast with reconstruct: contrastive 3d representation learning guided by generative pretraining. In: International Conference on Machine Learning (2023)"},{"key":"21_CR36","unstructured":"Qian, G., et al.: Pointnext: revisiting pointnet++ with improved training and scaling strategies. Adv. Neural Inform. Process. Syst. (2022)"},{"key":"21_CR37","doi-asserted-by":"crossref","unstructured":"Qian, G., Zhang, X., Hamdi, A., Ghanem, B.: Pix4point: image pretrained transformers for 3d point cloud understanding. In: 3DV, pp. 1280\u20131290 (2024)","DOI":"10.1109\/3DV62453.2024.00113"},{"key":"21_CR38","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"21_CR39","doi-asserted-by":"crossref","unstructured":"Rao, Y., Lu, J., Zhou, J.: Global-local bidirectional reasoning for unsupervised representation learning of 3d point clouds. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 5376\u20135385 (2020)","DOI":"10.1109\/CVPR42600.2020.00542"},{"key":"21_CR40","first-page":"12942","volume":"32","author":"J Sauder","year":"2019","unstructured":"Sauder, J., Sievers, B.: Self-supervised deep learning on point clouds by reconstructing space. Adv. Neural Inform. Process. Syst. 32, 12942\u201312952 (2019)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Uy, M.A., Pham, Q.H., Hua, B.S., Nguyen, T., Yeung, S.K.: Revisiting point cloud classification: a new benchmark dataset and classification model on real-world data. In: International Conference on Computer Vision, pp. 1588\u20131597 (2019)","DOI":"10.1109\/ICCV.2019.00167"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Vu, T., Kim, K., Luu, T.M., Nguyen, T., Yoo, C.D.: Softgroup for 3d instance segmentation on point clouds. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 2708\u20132717 (2022)","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Wang, H., Liu, Q., Yue, X., Lasenby, J., Kusner, M.J.: Unsupervised point cloud pre-training via occlusion completion. In: International Conference on Computer Vision, pp. 9782\u20139792 (2021)","DOI":"10.1109\/ICCV48922.2021.00964"},{"key":"21_CR44","doi-asserted-by":"crossref","unstructured":"Wang, H., Huang, D., Wang, Y.: Gridnet: efficiently learning deep hierarchical representation for 3d point cloud understanding. Front. Comput. Sci. 16, 161301 (2022)","DOI":"10.1007\/s11704-020-9521-2"},{"key":"21_CR45","unstructured":"Wang, Z., et al.: Connecting multi-modal contrastive representations. Adv. Neural Inform. Process. Syst. 36 (2024)"},{"key":"21_CR46","doi-asserted-by":"crossref","unstructured":"Wojek, C., Walk, S., Roth, S., Schiele, B.: Monocular 3d scene understanding with explicit occlusion reasoning. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 1993\u20132000. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995547"},{"key":"21_CR47","unstructured":"Wu, Z., et al.: 3d shapenets: a deep representation for volumetric shapes. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 1912\u20131920 (2015)"},{"key":"21_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"574","DOI":"10.1007\/978-3-030-58580-8_34","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Xie","year":"2020","unstructured":"Xie, S., Gu, J., Guo, D., Qi, C.R., Guibas, L., Litany, O.: PointContrast: unsupervised pre-training for 3D point cloud understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 574\u2013591. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_34"},{"key":"21_CR49","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.:: Ulip: learning a unified representation of language, images, and point clouds for 3d understanding. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 1179\u20131189 (2023)","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"21_CR50","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: Ulip-2: towards scalable multimodal pre-training for 3d understanding. In: IEEE Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.02558"},{"key":"21_CR51","doi-asserted-by":"crossref","unstructured":"Yan, S., et al.: Implicit autoencoder for point-cloud self-supervised representation learning. In: International Conference on Computer Vision, pp. 14530\u201314542 (2023)","DOI":"10.1109\/ICCV51070.2023.01336"},{"key":"21_CR52","unstructured":"Yao, L., et al.: Filip: fine-grained interactive language-image pre-training. In: International Conference on Learning Representation (2022)"},{"key":"21_CR53","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P.: Center-based 3d object detection and tracking. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 11784\u201311793 (2021)","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"21_CR54","doi-asserted-by":"crossref","unstructured":"Yu, X., Tang, L., Rao, Y., Huang, T., Zhou, J., Lu, J.: Point-bert: pre-training 3d point cloud transformers with masked point modeling. In: IEEE Conference on Computer Vision Pattern Recognition, pp. 19313\u201319322 (2022)","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"21_CR55","doi-asserted-by":"crossref","unstructured":"Yuan, X., et al.: Multimodal contrastive training for visual representation learning. In: IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00692"},{"key":"21_CR56","unstructured":"Zhang, R., et al.: Point-m2ae: multi-scale masked autoencoders for hierarchical point cloud pre-training. Adv. Neural Inform. Process. Syst. 35, 27061\u201327074 (2022)"},{"key":"21_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Pointclip: point cloud understanding by clip. In: IEEE Conference Computer Vision and Pattern Recognition, pp. 8552\u20138562 (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"21_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, R., Wang, L., Qiao, Y., Gao, P., Li, H.: Learning 3d representations from 2d pre-trained models via image-to-point masked autoencoders. In: IEEE Conference Computer Vision and Pattern Recognition, pp. 21769\u201321780 (2023)","DOI":"10.1109\/CVPR52729.2023.02085"},{"key":"21_CR59","unstructured":"Zhou, J., Wang, J., Ma, B., Liu, Y.S., Huang, T., Wang, X.: Uni3d: exploring unified 3d representation at scale. In: International Conference on Learning Representation (2023)"},{"key":"21_CR60","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: Pointclip v2: prompting clip and GPT for powerful 3d open-world learning. In: International Conference on Computer Vision, pp. 2639\u20132650 (2023)","DOI":"10.1109\/ICCV51070.2023.00249"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73414-4_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T17:11:51Z","timestamp":1729789911000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73414-4_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,25]]},"ISBN":["9783031734137","9783031734144"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73414-4_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,25]]},"assertion":[{"value":"25 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}