{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T00:17:56Z","timestamp":1770682676904,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62122010"],"award-info":[{"award-number":["62122010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["2022ZD011550"],"award-info":[{"award-number":["2022ZD011550"]}]},{"name":"CCF-DiDi GAIA Collaborative Research Funds for Young Scholars"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612107","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3745-3754","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Transferring CLIP's Knowledge into Zero-Shot Point Cloud Semantic Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1077-8142","authenticated-orcid":false,"given":"Yuanbin","family":"Wang","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8996-9907","authenticated-orcid":false,"given":"Shaofei","family":"Huang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-1288","authenticated-orcid":false,"given":"Yulu","family":"Gao","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5702-1254","authenticated-orcid":false,"given":"Zhen","family":"Wang","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8423-6170","authenticated-orcid":false,"given":"Rui","family":"Wang","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3370-7711","authenticated-orcid":false,"given":"Kehua","family":"Sheng","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3354-8018","authenticated-orcid":false,"given":"Bo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9180-2935","authenticated-orcid":false,"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume":"201","author":"Behley J.","unstructured":"J. Behley, M. Garbade, A. Milioto, J. Quenzel, S. Behnke, C. Stachniss, and J. Gall. 2019. SemanticKITTI: A Dataset for Semantic Scene Understanding of LiDAR Sequences. In ICCV.","journal-title":"J. Gall."},{"key":"e_1_3_2_1_2_1","volume-title":"Zero-shot semantic segmentation. \u00e7","author":"Bucher Maxime","year":"2019","unstructured":"Maxime Bucher, Tuan-Hung Vu, Matthieu Cord, and Patrick P\u00e9rez. 2019. Zero-shot semantic segmentation. \u00e7 (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Runnan Chen Youquan Liu Lingdong Kong Xinge Zhu Yuexin Ma Yikang Li Yuenan Hou Yu Qiao and Wenping Wang. 2023. CLIP2Scene: Towards Label-efficient 3D Scene Understanding by CLIP. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"e_1_3_2_1_4_1","volume-title":"Zero-shot Point Cloud Segmentation by Transferring Geometric Primitives. arXiv preprint arXiv:2210.09923","author":"Chen Runnan","year":"2022","unstructured":"Runnan Chen, Xinge Zhu, Nenglun Chen, Wei Li, Yuexin Ma, Ruigang Yang, and Wenping Wang. 2022. Zero-shot Point Cloud Segmentation by Transferring Geometric Primitives. arXiv preprint arXiv:2210.09923 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Cenet: Toward concise and efficient lidar semantic segmentation for autonomous driving. In ICME.","author":"Cheng HuiXian","year":"2022","unstructured":"HuiXian Cheng, XianFeng Han, and GuoQiang Xiao. 2022. Cenet: Toward concise and efficient lidar semantic segmentation for autonomous driving. In ICME."},{"key":"e_1_3_2_1_6_1","volume-title":"Mitigating the hubness problem for zero-shot learning of 3d objects. arXiv preprint arXiv:1907.06371","author":"Cheraghian Ali","year":"2019","unstructured":"Ali Cheraghian, Shafin Rahman, Dylan Campbell, and Lars Petersson. 2019b. Mitigating the hubness problem for zero-shot learning of 3d objects. arXiv preprint arXiv:1907.06371 (2019)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Ali Cheraghian Shafin Rahman and Lars Petersson. 2019a. Zero-shot learning of 3d point cloud objects. In MVA.","DOI":"10.23919\/MVA.2019.8758063"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Lubing Zhou, Holger Caesar, Oscar Beijbom, and Abhinav Valada.","author":"Fong Whye Kit","year":"2021","unstructured":"Whye Kit Fong, Rohit Mohan, Juana Valeria Hurtado, Lubing Zhou, Holger Caesar, Oscar Beijbom, and Abhinav Valada. 2021. Panoptic nuScenes: A Large-Scale Benchmark for LiDAR Panoptic Segmentation and Tracking. arXiv preprint arXiv:2109.03805 (2021)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Daniel Garrido Rui Rodrigues A Augusto Sousa Joao Jacob and Daniel Castro Silva. 2021. Point cloud interaction and manipulation in virtual reality. In AIVR.","DOI":"10.1145\/3480433.3480437"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Golnaz Ghiasi Xiuye Gu Yin Cui and Tsung-Yi Lin. 2022. Scaling open-vocabulary image segmentation with image-level labels. In ECCV.","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"e_1_3_2_1_12_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_13_1","unstructured":"Zhangxuan Gu Siyuan Zhou Li Niu Zihan Zhao and Liqing Zhang. 2020. Context-aware feature generation for zero-shot semantic segmentation. In ACM MM."},{"key":"e_1_3_2_1_14_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_1_15_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_16_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML."},{"key":"e_1_3_2_1_17_1","volume-title":"Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q Weinberger, Serge Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Language-Level Semantics Conditioned 3D Point Cloud Segmentation. arXiv preprint arXiv:2107.00430","author":"Liu Bo","year":"2021","unstructured":"Bo Liu, Shuang Deng, Qiulei Dong, and Zhanyi Hu. 2021a. Language-Level Semantics Conditioned 3D Point Cloud Segmentation. arXiv preprint arXiv:2107.00430 (2021)."},{"key":"e_1_3_2_1_19_1","unstructured":"Zhengzhe Liu Xiaojuan Qi and Chi-Wing Fu. 2021b. 3d-to-2d distillation for indoor scene parsing. In CVPR."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Bj\u00f6rn Michele Alexandre Boulch Gilles Puy Maxime Bucher and Renaud Marlet. 2021. Generative zero-shot learning for semantic segmentation of 3d point clouds. In 3DV.","DOI":"10.1109\/3DV53792.2021.00107"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Seyed Iman Mirzadeh Mehrdad Farajtabar Ang Li Nir Levine Akihiro Matsukawa and Hassan Ghasemzadeh. 2020. Improved knowledge distillation via teacher assistant. In AAAI.","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Songyou Peng Kyle Genova Chiyu Jiang Andrea Tagliasacchi Marc Pollefeys Thomas Funkhouser et al. 2023. Openscene: 3d scene understanding with open vocabularies. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"e_1_3_2_1_23_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Corentin Sautier Gilles Puy Spyros Gidaris Alexandre Boulch Andrei Bursuc and Renaud Marlet. 2022. Image-to-lidar self-supervised distillation for autonomous driving data. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00966"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351042"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Haotian Tang Zhijian Liu Shengyu Zhao Yujun Lin Ji Lin Hanrui Wang and Song Han. 2020. Searching efficient 3d architectures with sparse point-voxel convolution. In ECCV.","DOI":"10.1007\/978-3-030-58604-1_41"},{"key":"e_1_3_2_1_27_1","volume-title":"Stanley: The robot that won the DARPA Grand Challenge. J FIELD ROBOT","author":"Thrun Sebastian","year":"2006","unstructured":"Sebastian Thrun, Mike Montemerlo, Hendrik Dahlkamp, David Stavens, Andrei Aron, James Diebel, Philip Fong, John Gale, Morgan Halpenny, Gabriel Hoffmann, et al. 2006. Stanley: The robot that won the DARPA Grand Challenge. J FIELD ROBOT (2006)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Guiyu Tian Shuai Wang Jie Feng Li Zhou and Yadong Mu. 2020. Cap2seg: Inferring semantic and spatial context from captions for zero-shot image segmentation. In ACM MM.","DOI":"10.1145\/3394171.3413990"},{"key":"e_1_3_2_1_29_1","volume-title":"Attention is all you need. NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS (2017)."},{"key":"e_1_3_2_1_30_1","volume-title":"Head: Hetero-assists distillation for heterogeneous object detectors. In ECCV.","author":"Wang Luting","year":"2022","unstructured":"Luting Wang, Xiaojie Li, Yue Liao, Zeren Jiang, Jianlong Wu, Fei Wang, Chen Qian, and Si Liu. 2022. Head: Hetero-assists distillation for heterogeneous object detectors. In ECCV."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Xin Wen Zhizhong Han Geunhyuk Youk and Yu-Shen Liu. 2020. CF-SIS: Semantic-instance segmentation of 3D point clouds by context fusion with self-attention. In ACM MM.","DOI":"10.1145\/3394171.3413829"},{"key":"e_1_3_2_1_32_1","unstructured":"Jian Wu Jianbo Jiao Qingxiong Yang Zheng-Jun Zha and Xuejin Chen. 2019. Ground-aware point cloud semantic segmentation for autonomous driving. In ACM MM."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00845"},{"key":"e_1_3_2_1_34_1","unstructured":"Chenfeng Xu Shijia Yang Tomer Galanti Bichen Wu Xiangyu Yue Bohan Zhai Wei Zhan Peter Vajda Kurt Keutzer and Masayoshi Tomizuka. 2022b. Image2Point: 3D Point-Cloud Understanding with 2D Image Pretrained Models. In ECCV."},{"key":"e_1_3_2_1_35_1","volume-title":"Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, and Xiaolong Wang.","author":"Xu Jiarui","year":"2022","unstructured":"Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, and Xiaolong Wang. 2022a. Groupvit: Semantic segmentation emerges from text supervision. In CVPR."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Xu Yan Jiantao Gao Chaoda Zheng Chao Zheng Ruimao Zhang Shuguang Cui and Zhen Li. 2022. 2dpass: 2d priors assisted semantic segmentation on lidar point clouds. In ECCV.","DOI":"10.1007\/978-3-031-19815-1_39"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Yuhang Zang Wei Li Kaiyang Zhou Chen Huang and Chen Change Loy. 2022. Open-vocabulary detr with conditional matching. In ECCV.","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Peng Zhang Li Su Liang Li BingKun Bao Pamela Cosman GuoRong Li and Qingming Huang. 2019. Training efficient saliency prediction models with knowledge distillation. In ACM MM.","DOI":"10.1145\/3343031.3351089"},{"key":"e_1_3_2_1_39_1","volume-title":"Pointclip: Point cloud understanding by clip. In CVPR.","author":"Zhang Renrui","year":"2022","unstructured":"Renrui Zhang, Ziyu Guo, Wei Zhang, Kunchang Li, Xupeng Miao, Bin Cui, Yu Qiao, Peng Gao, and Hongsheng Li. 2022. Pointclip: Point cloud understanding by clip. In CVPR."},{"key":"e_1_3_2_1_40_1","volume-title":"Luowei Zhou, Xiyang Dai, Lu Yuan, Yin Li, et al.","author":"Zhong Yiwu","year":"2022","unstructured":"Yiwu Zhong, Jianwei Yang, Pengchuan Zhang, Chunyuan Li, Noel Codella, Liunian Harold Li, Luowei Zhou, Xiyang Dai, Lu Yuan, Yin Li, et al. 2022. Regionclip: Region-based language-image pretraining. In CVPR."},{"key":"e_1_3_2_1_41_1","volume-title":"Chen Change Loy, and Bo Dai","author":"Zhou Chong","year":"2022","unstructured":"Chong Zhou, Chen Change Loy, and Bo Dai. 2022. Extract free dense labels from clip. In ECCV."},{"key":"e_1_3_2_1_42_1","volume-title":"PointCLIP V2: Adapting CLIP for Powerful 3D Open-world Learning. arXiv preprint arXiv:2211.11682","author":"Zhu Xiangyang","year":"2022","unstructured":"Xiangyang Zhu, Renrui Zhang, Bowei He, Ziyao Zeng, Shanghang Zhang, and Peng Gao. 2022. PointCLIP V2: Adapting CLIP for Powerful 3D Open-world Learning. arXiv preprint arXiv:2211.11682 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Xinge Zhu Hui Zhou Tai Wang Fangzhou Hong Yuexin Ma Wei Li Hongsheng Li and Dahua Lin. 2021. Cylindrical and asymmetrical 3d convolution networks for lidar segmentation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00981"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612107","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612107","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:59:50Z","timestamp":1755820790000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612107"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":43,"alternative-id":["10.1145\/3581783.3612107","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612107","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}