{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T07:50:04Z","timestamp":1774079404001,"version":"3.50.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T00:00:00Z","timestamp":1774051200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T00:00:00Z","timestamp":1774051200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s13042-026-03042-3","type":"journal-article","created":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T05:21:50Z","timestamp":1774070510000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing spatial-semantic coherence of 3D visual grounding via spatial-aware encoder and target refinement"],"prefix":"10.1007","volume":"17","author":[{"given":"Shucheng","family":"Wan","sequence":"first","affiliation":[]},{"given":"Mingwen","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Lingzhuang","family":"Meng","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,21]]},"reference":[{"key":"3042_CR1","unstructured":"Ahn M, Brohan A, Brown N, Chebotar Y, Cortes O, David B, Finn C, Fu C, Gopalakrishnan K, Hausman K, et al (2022) Do as i can, not as i say: Grounding language in robotic affordances. 
arXiv preprint arXiv:2204.01691"},{"key":"3042_CR2","doi-asserted-by":"crossref","unstructured":"Zhao G, Li G, Chen W, Yu Y (2024) Over-nav: Elevating iterative vision-and-language navigation with open-vocabulary detection and structured representation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 16296\u201316306","DOI":"10.1109\/CVPR52733.2024.01542"},{"key":"3042_CR3","doi-asserted-by":"crossref","unstructured":"Li X, Zhang M, Geng Y, Geng H, Long Y, Shen Y, Zhang R, Liu J, Dong H (2024) Manipllm: Embodied multimodal large language model for object-centric robotic manipulation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 18061\u201318070","DOI":"10.1109\/CVPR52733.2024.01710"},{"key":"3042_CR4","doi-asserted-by":"crossref","unstructured":"Pan C, Yaman B, Nesti T, Mallik A, Allievi AG, Velipasalar S, Ren L (2024) Vlp: Vision language planning for autonomous driving. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 14760\u201314769","DOI":"10.1109\/CVPR52733.2024.01398"},{"key":"3042_CR5","doi-asserted-by":"crossref","unstructured":"Dai A, Chang AX, Savva M, Halber M, Funkhouser T, Nie\u00dfner M (2017) Scannet: Richly-annotated 3d reconstructions of indoor scenes. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 5828\u20135839","DOI":"10.1109\/CVPR.2017.261"},{"key":"3042_CR6","doi-asserted-by":"crossref","unstructured":"Qi CR, Litany O, He K, Guibas LJ (2019) Deep hough voting for 3d object detection in point clouds. In: Proceedings of the IEEE\/CVF international conference on computer vision, 9277\u20139286","DOI":"10.1109\/ICCV.2019.00937"},{"key":"3042_CR7","doi-asserted-by":"crossref","unstructured":"Rukhovich D, Vorontsova A, Konushin A (2022) Fcaf3d: Fully convolutional anchor-free 3d object detection. In: European Conference on Computer Vision, 477\u2013493. 
Springer","DOI":"10.1007\/978-3-031-20080-9_28"},{"issue":"12","key":"3042_CR8","doi-asserted-by":"publisher","first-page":"4338","DOI":"10.1109\/TPAMI.2020.3005434","volume":"43","author":"Y Guo","year":"2020","unstructured":"Guo Y, Wang H, Hu Q, Liu H, Liu L, Bennamoun M (2020) Deep learning for 3d point clouds: a survey. IEEE Trans Pattern Anal Mach Intell 43(12):4338\u20134364","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"3042_CR9","unstructured":"Qi CR, Su H, Mo K, Guibas LJ (2017) Pointnet: Deep learning on point sets for 3d classification and segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 652\u2013660"},{"key":"3042_CR10","doi-asserted-by":"crossref","unstructured":"Chen Y, Hu VT, Gavves E, Mensink T, Mettes P, Yang P, Snoek CG (2020) Pointmixup: Augmentation for point clouds. In: Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16, 330\u2013345. Springer","DOI":"10.1007\/978-3-030-58580-8_20"},{"key":"3042_CR11","doi-asserted-by":"crossref","unstructured":"Kim S, Lee S, Hwang D, Lee J, Hwang SJ, Kim HJ (2021) Point cloud augmentation with weighted local transformations. In: Proceedings of the IEEE\/CVF international conference on computer vision, 548\u2013557","DOI":"10.1109\/ICCV48922.2021.00059"},{"key":"3042_CR12","first-page":"23192","volume":"35","author":"G Qian","year":"2022","unstructured":"Qian G, Li Y, Peng H, Mai J, Hammoud H, Elhoseiny M, Ghanem B (2022) Pointnext: Revisiting pointnet++ with improved training and scaling strategies. Adv Neural Inf Process Syst 35:23192\u201323204","journal-title":"Adv Neural Inf Process Syst"},{"key":"3042_CR13","doi-asserted-by":"crossref","unstructured":"Yang J, Ding R, Deng W, Wang Z, Qi X (2024) Regionplc: Regional point-language contrastive learning for open-world 3d scene understanding. 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 19823\u201319832","DOI":"10.1109\/CVPR52733.2024.01874"},{"key":"3042_CR14","doi-asserted-by":"crossref","unstructured":"Achlioptas P, Abdelreheem A, Xia F, Elhoseiny M, Guibas L (2020) Referit3d: Neural listeners for fine-grained 3d object identification in real-world scenes. In: Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I 16, 422\u2013440. Springer","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"3042_CR15","doi-asserted-by":"crossref","unstructured":"Huang P-H, Lee H-H, Chen H-T, Liu T-L (2021) Text-guided graph neural networks for referring 3d instance segmentation. In: Proceedings of the AAAI conference on artificial intelligence, 35:1610\u20131618","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"3042_CR16","doi-asserted-by":"crossref","unstructured":"Yang Z, Zhang S, Wang L, Luo J (2021) Sat: 2d semantics assisted training for 3d visual grounding. In: Proceedings of the IEEE\/CVF international conference on computer vision, 1856\u20131866","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"3042_CR17","doi-asserted-by":"crossref","unstructured":"Zhao L, Cai D, Sheng L, Xu D (2021) 3dvg-transformer: Relation modeling for visual grounding on point clouds. In: Proceedings of the IEEE\/CVF international conference on computer vision, 2928\u20132937","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"3042_CR18","doi-asserted-by":"crossref","unstructured":"He D, Zhao Y, Luo J, Hui T, Huang S, Zhang A, Liu S (2021) Transrefer3d: Entity-and-relation aware transformer for fine-grained 3d visual grounding. 
In: Proceedings of the 29th ACM international conference on multimedia, 2344\u20132352","DOI":"10.1145\/3474085.3475397"},{"issue":"1","key":"3042_CR19","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1109\/TWC.2024.3495812","volume":"24","author":"G Pan","year":"2025","unstructured":"Pan G, Wu Q, Zhou B, Li J, Wang W, Ding G, Yau DKY (2025) Spectrum prediction with deep 3d pyramid vision transformer learning. IEEE Trans Wireless Commun 24(1):509\u2013525","journal-title":"IEEE Trans Wireless Commun"},{"key":"3042_CR20","doi-asserted-by":"crossref","unstructured":"Zhu Z, Ma X, Chen Y, Deng Z, Huang S, Li Q (2023) 3d-vista: Pre-trained transformer for 3d vision and text alignment. In: Proceedings of the IEEE\/CVF international conference on computer vision, 2911\u20132921","DOI":"10.1109\/ICCV51070.2023.00272"},{"key":"3042_CR21","doi-asserted-by":"crossref","unstructured":"Hsu J, Mao J, Wu J (2023) Ns3d: Neuro-symbolic grounding of 3d objects and relations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 2614\u20132623","DOI":"10.1109\/CVPR52729.2023.00257"},{"key":"3042_CR22","doi-asserted-by":"crossref","unstructured":"Shi X, Wu Z, Lee S (2024) Aware visual grounding in 3d scenes. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 14056\u201314065","DOI":"10.1109\/CVPR52733.2024.01333"},{"key":"3042_CR23","doi-asserted-by":"crossref","unstructured":"Jain A, Gkanatsios N, Mediratta I, Fragkiadaki K (2022) Bottom up top down detection transformers for language grounding in images and point clouds. In: European conference on computer vision, 417\u2013433. Springer","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"3042_CR24","doi-asserted-by":"crossref","unstructured":"Feng M, Li Z, Li Q, Zhang L, Zhang X, Zhu G, Zhang H, Wang Y, Mian A (2021) Free-form description guided 3d visual graph network for object grounding in point cloud. 
In: Proceedings of the IEEE\/CVF international conference on computer vision, 3722\u20133731","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"3042_CR25","doi-asserted-by":"crossref","unstructured":"Zhang Y, Gong Z, Chang AX (2023) Multi3drefer: Grounding text description to multiple 3d objects. In: Proceedings of the IEEE\/CVF international conference on computer vision, 15225\u201315236","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"3042_CR26","first-page":"37146","volume":"35","author":"E Bakr","year":"2022","unstructured":"Bakr E, Alsaedy Y, Elhoseiny M (2022) Look around and refer: 2d synthetic semantics knowledge distillation for 3d visual grounding. Adv Neural Inf Process Syst 35:37146\u201337158","journal-title":"Adv Neural Inf Process Syst"},{"key":"3042_CR27","doi-asserted-by":"crossref","unstructured":"Huang S, Chen Y, Jia J, Wang L (2022) Multi-view transformer for 3d visual grounding. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 15524\u201315533","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"3042_CR28","doi-asserted-by":"crossref","unstructured":"Guo Z, Tang Y, Zhang R, Wang D, Wang Z, Zhao B, Li X (2023) Viewrefer: Grasp the multi-view knowledge for 3d visual grounding. In: Proceedings of the IEEE\/CVF international conference on computer vision, 15372\u201315383","DOI":"10.1109\/ICCV51070.2023.01410"},{"key":"3042_CR29","doi-asserted-by":"crossref","unstructured":"Yuan Z, Yan X, Liao Y, Zhang R, Wang S, Li Z, Cui S (2021) Instancerefer: Cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In: Proceedings of the IEEE\/CVF international conference on computer vision, 1791\u20131800","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"3042_CR30","unstructured":"Roh J, Desingh K, Farhadi A, Fox D (2022) Languagerefer: Spatial-language model for 3d visual grounding. In: Conference on robot learning, 1046\u20131056. 
PMLR"},{"key":"3042_CR31","unstructured":"Qi CR, Yi L, Su H, Guibas LJ (2017) Pointnet++: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems 30"},{"key":"3042_CR32","doi-asserted-by":"crossref","unstructured":"Liu Y, Wang R, Shan S, Chen X (2018) Structure inference net: Object detection using scene-level context and instance-level relationships. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 6985\u20136994","DOI":"10.1109\/CVPR.2018.00730"},{"key":"3042_CR33","doi-asserted-by":"crossref","unstructured":"Zhang M, Tseng C, Kreiman G (2020) Putting visual object recognition in context. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, 12985\u201312994","DOI":"10.1109\/CVPR42600.2020.01300"},{"issue":"6","key":"3042_CR34","doi-asserted-by":"publisher","first-page":"712","DOI":"10.1016\/j.cviu.2010.02.004","volume":"114","author":"C Galleguillos","year":"2010","unstructured":"Galleguillos C, Belongie S (2010) Context based object categorization: a critical survey. Comput Vis Image Underst 114(6):712\u2013722","journal-title":"Comput Vis Image Underst"},{"issue":"1","key":"3042_CR35","doi-asserted-by":"publisher","first-page":"389","DOI":"10.5721\/EuJRS20144723","volume":"47","author":"M Li","year":"2014","unstructured":"Li M, Zang S, Zhang B, Li S, Wu C (2014) A review of remote sensing image classification techniques: The role of spatio-contextual information. Euro J Remote Sens 47(1):389\u2013411","journal-title":"Euro J Remote Sens"},{"key":"3042_CR36","doi-asserted-by":"crossref","unstructured":"Teney D, Liu L, Den Hengel A (2017) Graph-structured representations for visual question answering. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, 1\u20139","DOI":"10.1109\/CVPR.2017.344"},{"key":"3042_CR37","doi-asserted-by":"crossref","unstructured":"Alam F, Ofli F, Imran M (2018) Crisismmd: Multimodal twitter datasets from natural disasters. In: Proceedings of the international AAAI conference on web and social media, 12","DOI":"10.1609\/icwsm.v12i1.14983"},{"key":"3042_CR38","unstructured":"Pranesh R (2022) Exploring multimodal features and fusion strategies for analyzing disaster tweets. In: Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022), 62\u201368"},{"issue":"5","key":"3042_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2021.102610","volume":"58","author":"J Xue","year":"2021","unstructured":"Xue J, Wang Y, Tian Y, Li Y, Shi L, Wei L (2021) Detecting fake news by exploring the consistency of multimodal data. Inform Process Manag 58(5):102610","journal-title":"Inform Process Manag"},{"issue":"5","key":"3042_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3326362","volume":"38","author":"Y Wang","year":"2019","unstructured":"Wang Y, Sun Y, Liu Z, Sarma SE, Bronstein MM, Solomon JM (2019) Dynamic graph cnn for learning on point clouds. ACM Transactions on Graphics (tog) 38(5):1\u201312","journal-title":"ACM Transactions on Graphics (tog)"},{"key":"3042_CR41","unstructured":"Kipf TN, Welling M (2016) Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907"},{"key":"3042_CR42","unstructured":"Velickovic P, Cucurull G, Casanova A, Romero A, Lio P, Bengio Y, etal (2017) Graph attention networks. stat 1050(20), 10\u201348550"},{"key":"3042_CR43","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A (2020) etal Language models are few-shot learners. 
Adv Neural Inf Process Syst 33:1877\u20131901","journal-title":"Adv Neural Inf Process Syst"},{"key":"3042_CR44","doi-asserted-by":"crossref","unstructured":"Chen DZ, Chang AX, Nie\u00dfner M (2020) Scanrefer: 3d object localization in rgb-d scans using natural language. In: European Conference on Computer Vision, 202\u2013221. Springer","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"3042_CR45","unstructured":"Yang L, Zhang Z, Qi Z, Xu Y, Liu W, Shan Y, Li B, Yang W, Li P, Wang Y, et al (2024) Exploiting contextual objects and relations for 3d visual grounding. Advances in Neural Information Processing Systems 36"},{"key":"3042_CR46","doi-asserted-by":"crossref","unstructured":"Guo Z, Tang Y, Zhang R, Wang D, Wang Z, Zhao B, Li X (2023) Viewrefer: Grasp the multi-view knowledge for 3d visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 15372\u201315383","DOI":"10.1109\/ICCV51070.2023.01410"},{"key":"3042_CR47","doi-asserted-by":"crossref","unstructured":"Zhang T, He S, Dai T, Wang Z, Chen B, Xia S-T (2024) Vision-language pre-training with object contrastive learning for 3d scene understanding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, 7296\u20137304","DOI":"10.1609\/aaai.v38i7.28559"},{"key":"3042_CR48","doi-asserted-by":"crossref","unstructured":"Liu Y, Liu D, Guo Z, Hu W (2024) Cross-task knowledge transfer for semi-supervised joint 3d grounding and captioning. In: Proceedings of the 32nd ACM International Conference on Multimedia, 3818\u20133827","DOI":"10.1145\/3664647.3680614"},{"key":"3042_CR49","doi-asserted-by":"crossref","unstructured":"Wang T, Mao X, Zhu C, Xu R, Lyu R, Li P, Chen X, Zhang W, Chen K, Xue T, etal (2024)Embodiedscan: A holistic multi-modal 3d perception suite towards embodied ai. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 19757\u201319767","DOI":"10.1109\/CVPR52733.2024.01868"},{"key":"3042_CR50","doi-asserted-by":"crossref","unstructured":"Qian Z, Ma Y, Lin Z, Ji J, Zheng X, Sun X, Ji R (2024) Multi-branch collaborative learning network for 3d visual grounding. In: European Conference on Computer Vision, 381\u2013398. Springer","DOI":"10.1007\/978-3-031-72952-2_22"},{"key":"3042_CR51","unstructured":"Devlin J (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"3042_CR52","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"3042_CR53","doi-asserted-by":"crossref","unstructured":"Jiang L, Zhao H, Shi S, Liu S, Fu C-W, Jia J (2020) Pointgroup: Dual-set point grouping for 3d instance segmentation. 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 4867\u20134876","DOI":"10.1109\/CVPR42600.2020.00492"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-026-03042-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-026-03042-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-026-03042-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T05:21:59Z","timestamp":1774070519000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-026-03042-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,21]]},"references-count":53,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["3042"],"URL":"https:\/\/doi.org\/10.1007\/s13042-026-03042-3","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,21]]},"assertion":[{"value":"13 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors declared that they have no 
conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Funding and\/or Conflict of interest"}},{"value":"The authors declare no conflict of interest.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"211"}}