{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:16:27Z","timestamp":1777655787816,"version":"3.51.4"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729515","type":"print"},{"value":"9783031729522","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72952-2_22","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T05:02:02Z","timestamp":1727672522000},"page":"381-398","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Multi-branch Collaborative Learning Network for\u00a03D Visual Grounding"],"prefix":"10.1007","author":[{"given":"Zhipeng","family":"Qian","sequence":"first","affiliation":[]},{"given":"Yiwei","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Zhekai","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Jiayi","family":"Ji","sequence":"additional","affiliation":[]},{"given":"Xiawu","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Xiaoshuai","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"22_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1007\/978-3-030-58452-8_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"P Achlioptas","year":"2020","unstructured":"Achlioptas, P., Abdelreheem, A., Xia, F., Elhoseiny, M., Guibas, L.: ReferIt3D: neural listeners for fine-grained 3D object identification in real-world scenes. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 422\u2013440. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_25"},{"key":"22_CR2","doi-asserted-by":"publisher","unstructured":"Bolya, D., Zhou, C., Xiao, F., Lee, Y.J.: Yolact: real-time instance segmentation. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), October 2019. https:\/\/doi.org\/10.1109\/iccv.2019.00925, http:\/\/dx.doi.org\/10.1109\/iccv.2019.00925","DOI":"10.1109\/iccv.2019.00925"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Cai, D., Zhao, L., Zhang, J., Sheng, L., Xu, D.: 3DJCG: a unified framework for joint dense captioning and visual grounding on 3D point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16464\u201316473 (2022)","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"22_CR4","doi-asserted-by":"publisher","unstructured":"Caruana, R., Pratt, L., Thrun, S.: Multitask Learning , p. 893, January 2017. https:\/\/doi.org\/10.1007\/978-1-4899-7687-1_100322, http:\/\/dx.doi.org\/10.1007\/978-1-4899-7687-1_100322","DOI":"10.1007\/978-1-4899-7687-1_100322"},{"key":"22_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-58565-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"DZ Chen","year":"2020","unstructured":"Chen, D.Z., Chang, A.X., Nie\u00dfner, M.: ScanRefer: 3D object localization in RGB-D scans using natural language. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 202\u2013221. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_13"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Chen, D.Z., Wu, Q., Nie\u00dfner, M., Chang, A.X.: D3net: a speaker-listener architecture for semi-supervised dense captioning and visual grounding in RGB-D scans (2021)","DOI":"10.1007\/978-3-031-19824-3_29"},{"key":"22_CR7","first-page":"20522","volume":"35","author":"S Chen","year":"2022","unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Language conditioned spatial relation reasoning for 3D object grounding. Adv. Neural Inf. Process. Syst. 35, 20522\u201320535 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"22_CR8","doi-asserted-by":"publisher","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Niessner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), July 2017. https:\/\/doi.org\/10.1109\/cvpr.2017.261, http:\/\/dx.doi.org\/10.1109\/cvpr.2017.261","DOI":"10.1109\/cvpr.2017.261"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"Feng, M., et al.: Free-form description guided 3D visual graph network for object grounding in point cloud. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3722\u20133731 (2021)","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"22_CR10","unstructured":"Fu, C.Y., Shvets, M., Berg, A.: Retinamask: learning to predict masks improves state-of-the-art single-shot detection for free, January 2019. $$\\text{arXiv}$$: Computer Vision and Pattern RecognitionRecognition, $$\\text{ arXiv }$$: Computer Vision and Pattern Recognition"},{"key":"22_CR11","doi-asserted-by":"crossref","unstructured":"Han, L., Zheng, T., Xu, L., Fang, L.: Occuseg: occupancy-aware 3D instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2940\u20132949 (2020)","DOI":"10.1109\/CVPR42600.2020.00301"},{"key":"22_CR12","doi-asserted-by":"publisher","unstructured":"He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask R-CNN. IEEE Trans. Pattern Anal. Mach. Intell. 386\u2013397 (2020). https:\/\/doi.org\/10.1109\/tpami.2018.2844175, http:\/\/dx.doi.org\/10.1109\/tpami.2018.2844175","DOI":"10.1109\/tpami.2018.2844175"},{"key":"22_CR13","doi-asserted-by":"publisher","unstructured":"Hua, G., Liao, M., Tian, S., Zhang, Y., Zou, W.: Multiple relational learning network for joint referring expression comprehension and segmentation. IEEE Trans. Multimed. 1\u201313 (2023). https:\/\/doi.org\/10.1109\/tmm.2023.3241802, http:\/\/dx.doi.org\/10.1109\/tmm.2023.3241802","DOI":"10.1109\/tmm.2023.3241802"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Huang, P.H., Lee, H.H., Chen, H.T., Liu, T.L.: Text-guided graph neural networks for referring 3d instance segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, May 2021. Proceedings of the ... AAAI Conference on Artificial Intelligence","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"22_CR15","doi-asserted-by":"publisher","unstructured":"Jain, A., Gkanatsios, N., Mediratta, I., Fragkiadaki, K.: Bottom up top down detection transformers for language grounding in images and point clouds. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13696, pp. 417\u2013433. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_24","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"22_CR16","doi-asserted-by":"publisher","first-page":"3962","DOI":"10.1109\/TMM.2022.3169061","volume":"25","author":"J Ji","year":"2023","unstructured":"Ji, J., et al.: Multi-branch distance-sensitive self-attention network for image captioning. IEEE Trans. Multimed. 25, 3962\u20133974 (2023). https:\/\/doi.org\/10.1109\/TMM.2022.3169061","journal-title":"IEEE Trans. Multimed."},{"key":"22_CR17","doi-asserted-by":"publisher","first-page":"4321","DOI":"10.1109\/TIP.2022.3183434","volume":"31","author":"J Ji","year":"2022","unstructured":"Ji, J., Ma, Y., Sun, X., Zhou, Y., Wu, Y., Ji, R.: Knowing what to learn: a metric-oriented focal mechanism for image captioning. IEEE Trans. Image Process. 31, 4321\u20134335 (2022). https:\/\/doi.org\/10.1109\/TIP.2022.3183434","journal-title":"IEEE Trans. Image Process."},{"key":"22_CR18","doi-asserted-by":"publisher","unstructured":"Ji, J., et al.: Attacking image captioning towards accuracy-preserving target words removal. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 4226\u20134234. MM \u201920, Association for Computing Machinery, New York, NY, USA (2020). https:\/\/doi.org\/10.1145\/3394171.3414009","DOI":"10.1145\/3394171.3414009"},{"key":"22_CR19","doi-asserted-by":"publisher","unstructured":"Li, Q., Zhang, Y., Sun, S., Wu, J., Zhao, X., Tan, M.: Cross-modality synergy network for referring expression comprehension and segmentation. Neurocomputing 99\u2013114 (2022). https:\/\/doi.org\/10.1016\/j.neucom.2021.09.066, http:\/\/dx.doi.org\/10.1016\/j.neucom.2021.09.066","DOI":"10.1016\/j.neucom.2021.09.066"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Liang, Z., Li, Z., Xu, S., Tan, M., Jia, K.: Instance segmentation in 3D scenes using semantic superpoint tree networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2783\u20132792 (2021)","DOI":"10.1109\/ICCV48922.2021.00278"},{"key":"22_CR21","unstructured":"Lin, H., et al.: A unified framework for 3d point cloud visual grounding. arXiv preprint arXiv:2308.11887 (2023)"},{"key":"22_CR22","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Dollar, P.: Focal loss for dense object detection. In: 2017 IEEE International Conference on Computer Vision (ICCV), October 2017. https:\/\/doi.org\/10.1109\/iccv.2017.324, http:\/\/dx.doi.org\/10.1109\/iccv.2017.324","DOI":"10.1109\/iccv.2017.324"},{"key":"22_CR23","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach"},{"key":"22_CR24","doi-asserted-by":"publisher","unstructured":"Liu, Z., Zhang, Z., Cao, Y., Hu, H., Tong, X.: Group-free 3d object detection via transformers. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), October 2021. https:\/\/doi.org\/10.1109\/iccv48922.2021.00294, http:\/\/dx.doi.org\/10.1109\/iccv48922.2021.00294","DOI":"10.1109\/iccv48922.2021.00294"},{"key":"22_CR25","doi-asserted-by":"publisher","unstructured":"Luo, G., et al.: Multi-task collaborative network for joint referring expression comprehension and segmentation. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2020. https:\/\/doi.org\/10.1109\/cvpr42600.2020.01005, http:\/\/dx.doi.org\/10.1109\/cvpr42600.2020.01005","DOI":"10.1109\/cvpr42600.2020.01005"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Luo, J., et al.: 3D-SPS: single-stage 3D visual grounding via referred point progressive selection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16454\u201316463 (2022)","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"22_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma, Y., Ji, J., Sun, X., Zhou, Y., Ji, R.: Towards local visual modeling for image captioning. Pattern Recognit. 138, 109420 (2023)","journal-title":"Pattern Recognit."},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., Ji, R.: X-CLIP: end-to-end multi-grained contrastive learning for video-text retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 638\u2013647 (2022)","DOI":"10.1145\/3503161.3547910"},{"key":"22_CR29","doi-asserted-by":"crossref","unstructured":"Ma, Y., et al.: X-Mesh: towards fast and accurate text-driven 3d stylization via dynamic textual guidance. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2749\u20132760 (2023)","DOI":"10.1109\/ICCV51070.2023.00258"},{"key":"22_CR30","doi-asserted-by":"publisher","unstructured":"Milletari, F., Navab, N., Ahmadi, S.A.: V-net: fully convolutional neural networks for volumetric medical image segmentation. In: 2016 Fourth International Conference on 3D Vision (3DV), October 2016. https:\/\/doi.org\/10.1109\/3dv.2016.79, http:\/\/dx.doi.org\/10.1109\/3dv.2016.79","DOI":"10.1109\/3dv.2016.79"},{"key":"22_CR31","doi-asserted-by":"crossref","unstructured":"Nekrasov, V., Dharmasiri, T., Spek, A., Drummond, T., Shen, C., Reid, I.: Real-time joint semantic segmentation and depth estimation using asymmetric annotations, September 2018. $$\\text{ arXiv }$$: Computer Vision and Pattern Recognition,$$\\text{ arXiv }$$: Computer Vision and Pattern Recognition","DOI":"10.1109\/ICRA.2019.8794220"},{"key":"22_CR32","unstructured":"Qi, C., Yi, L., Su, H., Guibas, L.: Pointnet++: deep hierarchical feature learning on point sets in a metric space, June 2017. Cornell University - arXiv, Cornell University - arXiv"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Qian, Z., Ma, Y., Ji, J., Sun, X.: X-RefSeg3D: enhancing referring 3D instance segmentation via structured cross-modal graph neural networks. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 4551\u20134559 (2024)","DOI":"10.1609\/aaai.v38i5.28254"},{"key":"22_CR34","doi-asserted-by":"publisher","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2019. https:\/\/doi.org\/10.1109\/cvpr.2019.00075, http:\/\/dx.doi.org\/10.1109\/cvpr.2019.00075","DOI":"10.1109\/cvpr.2019.00075"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Sun, J., Qing, C., Tan, J., Xu, X.: Superpoint transformer for 3d scene instance segmentation, November 2022","DOI":"10.1609\/aaai.v37i2.25335"},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Wu, C., et al.: 3D-STMN: dependency-driven superpoint-text matching network for end-to-end 3D referring expression segmentation (2023)","DOI":"10.1609\/aaai.v38i6.28408"},{"key":"22_CR37","doi-asserted-by":"crossref","unstructured":"Wu, Y., Cheng, X., Zhang, R., Cheng, Z., Zhang, J.: EDA: explicit text-decoupling and dense alignment for 3d visual grounding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01843"},{"key":"22_CR38","unstructured":"Yang, D., et al.: Sam as the guide: mastering pseudo-label refinement in semi-supervised referring expression segmentation. arXiv preprint arXiv:2406.01451 (2024)"},{"key":"22_CR39","doi-asserted-by":"publisher","unstructured":"Yang, D., et al.: Semi-supervised panoptic narrative grounding. In: Proceedings of the 31st ACM International Conference on Multimedia, October 2023. https:\/\/doi.org\/10.1145\/3581783.3612259, http:\/\/dx.doi.org\/10.1145\/3581783.3612259","DOI":"10.1145\/3581783.3612259"},{"key":"22_CR40","doi-asserted-by":"crossref","unstructured":"Yang, Z., Zhang, S., Wang, L., Luo, J.: Sat: 2D semantics assisted training for 3d visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1856\u20131866 (2021)","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"22_CR41","doi-asserted-by":"crossref","unstructured":"Yuan, Z., et al.: InstanceRefer: cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1791\u20131800 (2021)","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"22_CR42","doi-asserted-by":"crossref","unstructured":"Zhao, L., Cai, D., Sheng, L., Xu, D.: 3DVG-Transformer: relation modeling for visual grounding on point clouds. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2928\u20132937 (2021)","DOI":"10.1109\/ICCV48922.2021.00292"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72952-2_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T05:14:42Z","timestamp":1727673282000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72952-2_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729515","9783031729522"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72952-2_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}