{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:04:12Z","timestamp":1777655052749,"version":"3.51.4"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031731945","type":"print"},{"value":"9783031731952","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73195-2_22","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T09:36:26Z","timestamp":1732613786000},"page":"376-392","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Unlocking Textual and\u00a0Visual Wisdom: Open-Vocabulary 3D Object Detection Enhanced by\u00a0Comprehensive Guidance from\u00a0Text and\u00a0Image"],"prefix":"10.1007","author":[{"given":"Pengkun","family":"Jiao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Na","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingjing","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu-Gang","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"issue":"10","key":"22_CR1","doi-asserted-by":"publisher","first-page":"3782","DOI":"10.1109\/TITS.2019.2892405","volume":"20","author":"E Arnold","year":"2019","unstructured":"Arnold, E., Al-Jarrah, O.Y., Dianati, M., Fallah, S., Oxtoby, D., Mouzakitis, A.: A survey on 3d object detection methods for autonomous driving applications. IEEE Trans. Intell. Transp. Syst. 20(10), 3782\u20133795 (2019)","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"22_CR2","unstructured":"Cao, Y., Yihan, Z., Xu, H., Xu, D.: Coda: collaborative novel box discovery and cross-modal alignment for open-vocabulary 3d object detection. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: Scannet: richly-annotated 3d reconstructions of indoor scenes. In: Proceedings of Computer Vision and Pattern Recognition (CVPR). IEEE (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: Pla: language-driven open-vocabulary 3d scene understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7010\u20137019 (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Han, Y., Zhao, N., Chen, W., Ma, K.T., Zhang, H.: Dual-perspective knowledge enrichment for semi-supervised 3d object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 2049\u20132057 (2024)","DOI":"10.1609\/aaai.v38i3.27976"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Jiao, Y., Jie, Z., Chen, S., Chen, J., Ma, L., Jiang, Y.G.: Msmdfusion: fusing lidar and camera at multiple scales with multi-depth seeds for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21643\u201321652 (2023)","DOI":"10.1109\/CVPR52729.2023.02073"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: Mdetr-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: Lisa: reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"Li*, L.H., et al.: Grounded language-image pre-training. In: CVPR (2022)","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding dino: marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"22_CR11","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, Z., Cao, Y., Hu, H., Tong, X.: Group-free 3d object detection via transformers. arXiv preprint arXiv:2104.00678 (2021)","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Lu, Y., et al.: Open-vocabulary point-cloud object detection without 3d annotation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1190\u20131199 (2023)","DOI":"10.1109\/CVPR52729.2023.00121"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Mao, J., Shi, S., Wang, X., Li, H.: 3d object detection for autonomous driving: a comprehensive survey. Int. J. Comput. Vision, 1\u201355 (2023)","DOI":"10.1007\/s11263-023-01790-1"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3d object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2906\u20132917 (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Pan, X., Xia, Z., Song, S., Li, L.E., Huang, G.: 3d object detection with pointformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7463\u20137472 (2021)","DOI":"10.1109\/CVPR46437.2021.00738"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"Qi, C.R., Litany, O., He, K., Guibas, L.J.: Deep hough voting for 3d object detection in point clouds. In: proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9277\u20139286 (2019)","DOI":"10.1109\/ICCV.2019.00937"},{"key":"22_CR18","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"22_CR19","unstructured":"Qian, X., et al.: Mlcvnet: multi-level context votenet for 3d object detection. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2020)"},{"key":"22_CR20","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"22_CR21","doi-asserted-by":"publisher","unstructured":"Sheng, H., et al.: Rethinking iou-based optimization for single-stage 3d object detection. In: European Conference on Computer Vision, pp. 544\u2013561. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_32","DOI":"10.1007\/978-3-031-20077-9_32"},{"key":"22_CR22","doi-asserted-by":"crossref","unstructured":"Wu, H., Wen, C., Li, W., Li, X., Yang, R., Wang, C.: Transformation-equivariant 3d object detection for autonomous driving. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 2795\u20132802 (2023)","DOI":"10.1609\/aaai.v37i3.25380"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Yang, B., Luo, W., Urtasun, R.: Pixor: real-time 3d object detection from point clouds. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 7652\u20137660 (2018)","DOI":"10.1109\/CVPR.2018.00798"},{"key":"22_CR24","unstructured":"Yao, L., et al.: Detclip: dictionary-enriched visual-concept paralleled pre-training for open-world detection (2022)"},{"key":"22_CR25","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P.: Center-based 3d object detection and tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11784\u201311793 (2021)","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Zeng, Y., et al.: Clip2: contrastive language-image-point pretraining from real-world point cloud data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15244\u201315253 (2023)","DOI":"10.1109\/CVPR52729.2023.01463"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Pointclip: point cloud understanding by clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8552\u20138562 (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Zhao, N., Chua, T.S., Lee, G.H.: Sess: self-ensembling semi-supervised 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11079\u201311087 (2020)","DOI":"10.1109\/CVPR42600.2020.01109"},{"key":"22_CR29","doi-asserted-by":"publisher","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., Misra, I.: Detecting twenty-thousand classes using image-level supervision. In: European Conference on Computer Vision, pp. 350\u2013368. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_21","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: Voxelnet: end-to-end learning for point cloud based 3d object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4490\u20134499 (2018)","DOI":"10.1109\/CVPR.2018.00472"},{"key":"22_CR31","unstructured":"Zhu, C., Zhang, W., Wang, T., Liu, X., Chen, K.: Object2scene: putting objects in context for open-vocabulary 3d detection. arXiv preprint arXiv:2309.09456 (2023)"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: Pointclip v2: prompting clip and gpt for powerful 3d open-world learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2639\u20132650 (2023)","DOI":"10.1109\/ICCV51070.2023.00249"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73195-2_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T10:13:12Z","timestamp":1732615992000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73195-2_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031731945","9783031731952"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73195-2_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}