{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T20:22:12Z","timestamp":1771705332425,"version":"3.50.1"},"publisher-location":"Cham","reference-count":69,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726323","type":"print"},{"value":"9783031726330","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72633-0_25","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T07:57:04Z","timestamp":1732175824000},"page":"442-460","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Open Vocabulary 3D Scene Understanding via\u00a0Geometry Guided Self-Distillation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3675-9508","authenticated-orcid":false,"given":"Pengfei","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1579-2357","authenticated-orcid":false,"given":"Yuxi","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0760-5267","authenticated-orcid":false,"given":"Shuai","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zhaoxiang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zhen","family":"Lei","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2078-4215","authenticated-orcid":false,"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"25_CR1","unstructured":"Vit-gpt2 image captioning. https:\/\/huggingface.co\/nlpconnect\/vit-gpt2-image-captioning\/discussions"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuscenes: a multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp, 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3d: learning from RGB-d data in indoor environments. arXiv preprint arXiv:1709.06158 (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Chen, A., et al.: PiMAE: point cloud and image interactive masked autoencoders for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5291\u20135301 (2023)","DOI":"10.1109\/CVPR52729.2023.00512"},{"key":"25_CR5","unstructured":"Chen, R., et al.: Towards label-free scene understanding by vision foundation models. In: Thirty-seventh Conference on Neural Information Processing Systems (2023)"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Chen, R., et al.: Clip2scene: towards label-efficient 3d scene understanding by clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7020\u20137030 (2023)","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"25_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"25_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"25_CR9","doi-asserted-by":"crossref","unstructured":"Choy, C., Gwak, J., Savarese, S.: 4d spatio-temporal convnets: Minkowski convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3075\u20133084 (2019)","DOI":"10.1109\/CVPR.2019.00319"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3d reconstructions of indoor scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5828\u20135839 (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Dai, A., Ritchie, D., Bokeloh, M., Reed, S., Sturm, J., Nie\u00dfner, M.: ScanComplete: large-scale scene completion and semantic segmentation for 3d scans. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4578\u20134587 (2018)","DOI":"10.1109\/CVPR.2018.00481"},{"key":"25_CR12","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: PLA: language-driven open-vocabulary 3d scene understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7010\u20137019 (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., Li, G.C.: Learning to prompt for open-vocabulary object detection with vision-language model. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01369"},{"issue":"3","key":"25_CR14","doi-asserted-by":"publisher","first-page":"163347","DOI":"10.1007\/s11704-022-2015-7","volume":"16","author":"J Fan","year":"2022","unstructured":"Fan, J., Wang, Y., Guan, H., Song, C., Zhang, Z.: Toward few-shot domain adaptation with perturbation-invariant representation and transferable prototypes. Front. Comp. Sci. 16(3), 163347 (2022)","journal-title":"Front. Comp. Sci."},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Feng, C., et al.: PromptDet: towards open-vocabulary detection using uncurated images. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"25_CR16","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Open-vocabulary image segmentation. In: ECCV (2022)"},{"key":"25_CR17","doi-asserted-by":"publisher","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022, ECCV 2022, LNCS, vol. 13696, pp. 540\u2013557. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Graham, B., Engelcke, M., Van Der\u00a0Maaten, L.: 3d semantic segmentation with submanifold sparse convolutional networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 9224\u20139232 (2018)","DOI":"10.1109\/CVPR.2018.00961"},{"key":"25_CR19","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"25_CR20","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. In: ICLR (2021)"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Han, L., Zheng, T., Xu, L., Fang, L.: OccuSeg: occupancy-aware 3d instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2940\u20132949 (2020)","DOI":"10.1109\/CVPR42600.2020.00301"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"25_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Hu, W., Zhao, H., Jiang, L., Jia, J., Wong, T.T.: Bidirectional projection network for cross dimension scene understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14373\u201314382 (2021)","DOI":"10.1109\/CVPR46437.2021.01414"},{"key":"25_CR25","unstructured":"Hu, X., et al.: Semantic anything in 3d gaussians. arXiv preprint arXiv:2401.17857 (2024)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Hu, Z., et al.: VMNet: voxel-mesh network for geodesic-aware 3d semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15488\u201315498 (2021)","DOI":"10.1109\/ICCV48922.2021.01520"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Huang, J., Zhang, H., Yi, L., Funkhouser, T., Nie\u00dfner, M., Guibas, L.J.: TextureNet: consistent local parametrizations for learning from high-resolution signals on meshes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4440\u20134449 (2019)","DOI":"10.1109\/CVPR.2019.00457"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: Clip2point: transfer clip to point cloud classification with image-depth pre-training. arXiv preprint arXiv:2210.01055 (2022)","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3d gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42(4), 1\u2013139 (2023)","DOI":"10.1145\/3592433"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Kerr, J., Kim, C.M., Goldberg, K., Kanazawa, A., Tancik, M.: LERF: language embedded radiance fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19729\u201319739 (2023)","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"25_CR31","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"25_CR32","unstructured":"Kuo, W., Cui, Y., Gu, X., Piergiovanni, A., Angelova, A.: F-VLM: open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv:2209.15639 (2022)"},{"key":"25_CR33","doi-asserted-by":"crossref","unstructured":"Lambert, J., Liu, Z., Sener, O., Hays, J., Koltun, V.: MSeg: a composite dataset for multi-domain semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2879\u20132888 (2020)","DOI":"10.1109\/CVPR42600.2020.00295"},{"key":"25_CR34","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=RriDjddCLN"},{"key":"25_CR35","unstructured":"Li, B., Weinberger, K.Q., Belongie, S.J., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: ICLR (2022)"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"Li, J., He, X., Wen, Y., Gao, Y., Cheng, X., Zhang, D.: Panoptic-PHNet: towards real-time and high-precision lidar panoptic segmentation via clustering pseudo heatmap. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11809\u201311818 (2022)","DOI":"10.1109\/CVPR52688.2022.01151"},{"key":"25_CR37","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. arXiv abs\/2210.04150 (2022)"},{"key":"25_CR38","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7061\u20137070 (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"25_CR39","unstructured":"Liu, K., et al.: Weakly supervised 3d open-vocabulary segmentation. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"25_CR40","doi-asserted-by":"crossref","unstructured":"L\u00fcddecke, T., Ecker, A.S.: Image segmentation using text and image prompts. In: CVPR (2021)","DOI":"10.1109\/CVPR52688.2022.00695"},{"issue":"1","key":"25_CR41","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"25_CR42","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3d object detection. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"25_CR43","doi-asserted-by":"crossref","unstructured":"Nekrasov, A., Schult, J., Litany, O., Leibe, B., Engelmann, F.: Mix3d: out-of-context data augmentation for 3d scenes. In: 2021 International Conference on 3D Vision (3DV), pp. 116\u2013125. IEEE (2021)","DOI":"10.1109\/3DV53792.2021.00022"},{"key":"25_CR44","doi-asserted-by":"crossref","unstructured":"Papon, J., Abramov, A., Schoeler, M., Worgotter, F.: Voxel cloud connectivity segmentation-supervoxels for point clouds. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2027\u20132034 (2013)","DOI":"10.1109\/CVPR.2013.264"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Peng, S., Genova, K., Jiang, C., Tagliasacchi, A., Pollefeys, M., Funkhouser, T., et\u00a0al.: Openscene: 3d scene understanding with open vocabularies. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 815\u2013824 (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"25_CR46","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"25_CR47","doi-asserted-by":"crossref","unstructured":"Qin, M., Li, W., Zhou, J., Wang, H., Pfister, H.: LangSplat: 3d language gaussian splatting. arXiv preprint arXiv:2312.16084 (2023)","DOI":"10.1109\/CVPR52733.2024.01895"},{"key":"25_CR48","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"25_CR49","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: DenseCLIP: language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"25_CR50","unstructured":"Rasheed, H., Maaz, M., Khattak, M.U., Khan, S., Khan, F.S.: Bridging the gap between object and image-level representations for open-vocabulary detection. In: 36th Conference on Neural Information Processing Systems (NIPS) (2022)"},{"key":"25_CR51","unstructured":"Rasheed, H., Maaz, M., Khattak, M.U., Khan, S., Khan, F.S.: Bridging the gap between object and image-level representations for open-vocabulary detection. In: NeurIPS (2022)"},{"key":"25_CR52","doi-asserted-by":"crossref","unstructured":"Robert, D., Vallet, B., Landrieu, L.: Learning multi-view aggregation in the wild for large-scale 3d semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5575\u20135584 (2022)","DOI":"10.1109\/CVPR52688.2022.00549"},{"key":"25_CR53","doi-asserted-by":"crossref","unstructured":"Schult, J., Engelmann, F., Kontogianni, T., Leibe, B.: DualConvMesh-Net: joint geodesic and Euclidean convolutions on 3d meshes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8612\u20138622 (2020)","DOI":"10.1109\/CVPR42600.2020.00864"},{"key":"25_CR54","unstructured":"Takmaz, A., Fedele, E., Sumner, R.W., Pollefeys, M., Tombari, F., Engelmann, F.: Openmask3d: open-vocabulary 3d instance segmentation. arXiv preprint arXiv:2306.13631 (2023)"},{"key":"25_CR55","doi-asserted-by":"crossref","unstructured":"Tatarchenko, M., Park, J., Koltun, V., Zhou, Q.Y.: Tangent convolutions for dense prediction in 3d. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3887\u20133896 (2018)","DOI":"10.1109\/CVPR.2018.00409"},{"key":"25_CR56","doi-asserted-by":"crossref","unstructured":"Vu, T., Kim, K., Luu, T.M., Nguyen, X.T., Yoo, C.D.: Softgroup for 3d instance segmentation on 3d point clouds. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"25_CR57","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Pulling target to source: a new perspective on domain adaptive semantic segmentation. arXiv preprint arXiv:2305.13752 (2023)","DOI":"10.1007\/s11263-024-02285-3"},{"key":"25_CR58","unstructured":"Wang, P., et al.: Ofa: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. CoRR abs\/2202.03052 (2022)"},{"key":"25_CR59","doi-asserted-by":"crossref","unstructured":"Wang, Y., Liang, J., Xiao, J., Mei, S., Yang, Y., Zhang, Z.: Informative data mining for one-shot cross-domain semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1064\u20131074 (2023)","DOI":"10.1109\/ICCV51070.2023.00104"},{"key":"25_CR60","doi-asserted-by":"crossref","unstructured":"Wang, Y., Peng, J., Zhang, Z.: Uncertainty-aware pseudo label refinery for domain adaptive semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9092\u20139101 (2021)","DOI":"10.1109\/ICCV48922.2021.00896"},{"key":"25_CR61","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model. arXiv preprint arXiv:2112.14757 (2021)","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"25_CR62","doi-asserted-by":"crossref","unstructured":"Yang, J., Ding, R., Wang, Z., Qi, X.: RegionPLC: regional point-language contrastive learning for open-world 3d scene understanding. arXiv preprint arXiv:2304.00962 (2023)","DOI":"10.1109\/CVPR52733.2024.01874"},{"key":"25_CR63","doi-asserted-by":"crossref","unstructured":"Ye, D., et al.: LidarmultiNet: towards a unified multi-task network for lidar perception. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 3231\u20133240 (2023)","DOI":"10.1609\/aaai.v37i3.25429"},{"key":"25_CR64","doi-asserted-by":"crossref","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-vocabulary detr with conditional matching. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"25_CR65","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Mp-former: mask-piloted transformer for image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18074\u201318083 (2023)","DOI":"10.1109\/CVPR52729.2023.01733"},{"key":"25_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, J., Dong, R., Ma, K.: Clip-fo3d: learning free open-world 3d scene representations from 2d dense clip. arXiv preprint arXiv:2303.04748 (2023)","DOI":"10.1109\/ICCVW60793.2023.00219"},{"key":"25_CR67","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Pointclip: point cloud understanding by clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8552\u20138562 (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"25_CR68","doi-asserted-by":"crossref","unstructured":"Zhong, Y., et al.: RegionCLIP: region-based language-image pretraining. In: CVPR (2021)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"25_CR69","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19815-1_40"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72633-0_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T08:15:55Z","timestamp":1732176955000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72633-0_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031726323","9783031726330"],"references-count":69,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72633-0_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}