{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:19:22Z","timestamp":1757618362067,"version":"3.44.0"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031959103"},{"type":"electronic","value":"9783031959110"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-95911-0_25","type":"book-chapter","created":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T06:07:43Z","timestamp":1750486063000},"page":"353-367","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["NVSMask3D: Hard Visual Prompting with\u00a0Camera Pose Interpolation for\u00a03D Open Vocabulary Instance Segmentation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-7768-2471","authenticated-orcid":false,"given":"Junyuan","family":"Fang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0695-5839","authenticated-orcid":false,"given":"Zihan","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2838-9518","authenticated-orcid":false,"given":"Yejun","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1281-4370","authenticated-orcid":false,"given":"Shuzhe","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3819-5280","authenticated-orcid":false,"given":"Iaroslav","family":"Melekhov","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5088-4041","authenticated-orcid":false,"given":"Juho","family":"Kannala","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,16]]},"reference":[{"key":"25_CR1","unstructured":"Boudjoghra, M.E.A., et al.: Open-yolo 3D: towards fast and accurate open-vocabulary 3D instance segmentation (2024)"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Chen, R., et al.: Clip2scene: towards label-efficient 3D scene understanding by clip. In: CVPR, pp. 7020\u20137030 (2023)","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Cheng, T., Song, L., Ge, Y., Liu, W., Wang, X., Shan, Y.: Yolo-world: real-time open-vocabulary object detection. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: Scannet: richly-annotated 3D reconstructions of indoor scenes. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: PLA: language-driven open-vocabulary 3D scene understanding. In: CVPR, pp. 7010\u20137019 (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: Lowis3d: language-driven open-world instance-level 3D scene understanding. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3410324"},{"key":"25_CR7","unstructured":"Engelmann, F., Manhardt, F., Niemeyer, M., Tateno, K., Pollefeys, M., Tombari, F.: OpenNeRF: open set 3D neural scene segmentation with pixel-wise features and rendered novel views. In: ICLR (2024)"},{"key":"25_CR8","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: ECCV, pp. 540\u2013557. Springer (2022)","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"25_CR9","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wu, X., Chen, X., Zhao, H., Zhu, L., Lasenby, J.: Openins3d: snap and lookup for 3D open-vocabulary instance segmentation. In: ECCV (2024)","DOI":"10.1007\/978-3-031-73033-7_10"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Jia, M., et al.: Visual prompt tuning. In: ECCV, pp. 709\u2013727. Springer (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3D gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42(4) (2023). https:\/\/repo-sam.inria.fr\/fungraph\/3d-gaussian-splatting\/","DOI":"10.1145\/3592433"},{"key":"25_CR12","doi-asserted-by":"crossref","unstructured":"Kim, Y., Lee, J., Kim, J.H., Ha, J.W., Zhu, J.Y.: Dense text-to-image generation with attention modulation. In: ICCV, pp. 7701\u20137711 (2023)","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. In: ICCV, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Landrieu, L., Boussaha, M.: Point cloud oversegmentation with graph-structured deep metric learning. In: CVPR, pp. 7440\u20137449 (2019)","DOI":"10.1109\/CVPR.2019.00762"},{"key":"25_CR15","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: ICLR (2022). https:\/\/openreview.net\/forum?id=RriDjddCLN"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: CVPR, pp. 7061\u20137070 (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"25_CR17","unstructured":"Lu, S., Chang, H., Jing, E.P., Boularias, A., Bekris, K.: Ovir-3d: open-vocabulary 3D instance retrieval without training on 3D data. In: Conference on Robot Learning, pp. 1610\u20131620. PMLR (2023)"},{"issue":"1","key":"25_CR18","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"25_CR19","unstructured":"Minderer, M., Gritsenko, A., Houlsby, N.: Scaling open-vocabulary object detection. In: NeurIPS (2024)"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Nguyen, P., et al.: Open3dis: open-vocabulary 3D instance segmentation with 2D mask guidance. In: CVPR, pp. 4018\u20134028 (2024)","DOI":"10.1109\/CVPR52733.2024.00385"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Nguyen, P.D., et al.: Open3dis: open-vocabulary 3D instance segmentation with 2D mask guidance. In: CVPR (2023)","DOI":"10.1109\/CVPR52733.2024.00385"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Peng, S., et al.: Openscene: 3D scene understanding with open vocabularies. In: CVPR, pp. 815\u2013824 (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"25_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126658","volume":"555","author":"H Pham","year":"2023","unstructured":"Pham, H., et al.: Combined scaling for zero-shot transfer learning. Neurocomputing 555, 126658 (2023)","journal-title":"Neurocomputing"},{"key":"25_CR24","unstructured":"Qi, C.R., Su, H., Mo, K., Guibas, L.J.: Pointnet: deep learning on point sets for 3D classification and segmentation. In: CVPR, pp. 652\u2013660 (2017)"},{"key":"25_CR25","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. In: NeurIPS (2017)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Qin, M., Li, W., Zhou, J., Wang, H., Pfister, H.: Langsplat: 3D language gaussian splatting. In: CVPR (2023)","DOI":"10.1109\/CVPR52733.2024.01895"},{"key":"25_CR27","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763. PMLR (2021)"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Robert, D., Raguet, H., Landrieu, L.: Efficient 3D semantic segmentation with superpoint transformer. In: ICCV, pp. 17195\u201317204 (2023)","DOI":"10.1109\/ICCV51070.2023.01577"},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"Rozenberszki, D., Litany, O., Dai, A.: Language-grounded indoor 3D semantic segmentation in the wild. In: ECCV, pp. 125\u2013141. Springer (2022)","DOI":"10.1007\/978-3-031-19827-4_8"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Schult, J., Engelmann, F., Hermans, A., Litany, O., Tang, S., Leibe, B.: Mask3d: mask transformer for 3D semantic instance segmentation. In: ICRA, pp. 8216\u20138223. IEEE (2023)","DOI":"10.1109\/ICRA48891.2023.10160590"},{"key":"25_CR31","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does clip know about a red circle? Visual prompt engineering for VLMs. In: ICCV, pp. 11987\u201311997 (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"25_CR32","unstructured":"Straub, J., et al.: The replica dataset: a digital replica of indoor spaces. arXiv preprint arXiv:1906.05797 (2019)"},{"key":"25_CR33","unstructured":"Takmaz, A., Fedele, E., Sumner, R.W., Pollefeys, M., Tombari, F., Engelmann, F.: Openmask3d: open-vocabulary 3D instance segmentation. In: NeurIPS (2023)"},{"key":"25_CR34","unstructured":"Wang, T., et al.: Caption anything: interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Yang, J., Ding, R., Deng, W., Wang, Z., Qi, X.: Regionplc: regional point-language contrastive learning for open-world 3D scene understanding. In: CVPR, pp. 19823\u201319832 (2024)","DOI":"10.1109\/CVPR52733.2024.01874"},{"key":"25_CR36","unstructured":"Yang, L., Wang, Y., Li, X., Wang, X., Yang, J.: Fine-grained visual prompting. In: NeurIPS, vol.\u00a036, pp. 24993\u201325006. Curran Associates, Inc. (2023)"},{"key":"25_CR37","doi-asserted-by":"crossref","unstructured":"Yeshwanth, C., Liu, Y.C., Nie\u00dfner, M., Dai, A.: Scannet++: a high-fidelity dataset of 3D indoor scenes. In: ICCV, pp. 12\u201322 (2023)","DOI":"10.1109\/ICCV51070.2023.00008"},{"key":"25_CR38","unstructured":"Yoo, S., Kim, E., Jung, D., Lee, J., Yoon, S.: Improving visual prompt tuning for self-supervised vision transformers. In: ICML, pp. 40075\u201340092. PMLR (2023)"},{"key":"25_CR39","doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: Lit: zero-shot transfer with locked-image text tuning. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"25_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, J., Dong, R., Ma, K.: Clip-fo3d: learning free open-world 3D scene representations from 2D dense clip. In: ICCV, pp. 2048\u20132059 (2023)","DOI":"10.1109\/ICCVW60793.2023.00219"}],"container-title":["Lecture Notes in Computer Science","Image Analysis"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-95911-0_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T21:16:54Z","timestamp":1757193414000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-95911-0_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031959103","9783031959110"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-95911-0_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"16 June 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SCIA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Scandinavian Conference on Image Analysis","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Reykjavik","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Iceland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"scia2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/scia2025.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}