{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:12:15Z","timestamp":1778080335161,"version":"3.51.4"},"reference-count":174,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"State Key Laboratory of General Artificial Intelligence"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tnnls.2025.3584895","type":"journal-article","created":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T17:56:16Z","timestamp":1753379776000},"page":"17717-17737","source":"Crossref","is-referenced-by-count":14,"title":["A Survey on Text-Guided 3-D Visual Grounding: Elements, Recent Advances, and Future Directions"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-4508","authenticated-orcid":false,"given":"Daizong","family":"Liu","sequence":"first","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9329-754X","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wencan","family":"Huang","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9860-0922","authenticated-orcid":false,"given":"Wei","family":"Hu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3005434"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.3390\/rs12111729"},{"key":"ref3","first-page":"40","article-title":"Learning representations and generative models for 3D point clouds","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","volume":"80","author":"Achlioptas"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2977924"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2927869"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2002.1114854"},{"key":"ref7","first-page":"5885","article-title":"Computer-aided design as language","volume-title":"Proc. NeurIPS","author":"Ganin"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2985588"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01018"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3309104"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"ref13","article-title":"Recent advances in multi-modal 3D scene understanding: A comprehensive survey and evaluation","author":"Lei","year":"2023","journal-title":"arXiv:2310.15676"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3296889"},{"key":"ref15","article-title":"Language-guided robot grasping: CLIP-based referring grasp synthesis in clutter","author":"Tziafas","year":"2023","journal-title":"arXiv:2311.05779"},{"key":"ref16","article-title":"3D-VLA: A 3D vision-language-action generative world model","author":"Zhen","year":"2024","journal-title":"arXiv:2403.09631"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3258628"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3532626"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3556537"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042066"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127599"},{"key":"ref22","article-title":"Towards visual grounding: A survey","author":"Xiao","year":"2024","journal-title":"arXiv:2412.20206"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72673-6_16"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00446"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3177134"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3031371"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_3"},{"key":"ref29","first-page":"1046","article-title":"LanguageRefer: Spatial-language model for 3D visual grounding","volume":"164","author":"Roh","year":"2022","journal-title":"Proc. Mach. Learn. Res."},{"key":"ref30","first-page":"20522","article-title":"Language conditioned spatial relation reasoning for 3D object grounding","volume-title":"Proc. NeurIPS","author":"Chen"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475397"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611902"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01843"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00068"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00251"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00963"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00301"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.90"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-11015-4_29"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2019.2937630"},{"key":"ref49","first-page":"1","article-title":"PointNet++: Deep hierarchical feature learning on point sets in a metric space","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Qi"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00319"},{"key":"ref51","article-title":"HAM: Hierarchical attention model with high performance for 3D visual grounding","volume-title":"arXiv:2210.12513","author":"Chen","year":"2022"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01410"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-2812"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00677"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1609.02907"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref61","article-title":"Graph attention networks","volume-title":"arXiv:1710.10903","author":"Veli\u010dkovi\u0107","year":"2017"},{"key":"ref62","first-page":"1","article-title":"Inductive representation learning on large graphs","volume-title":"Proc. NeurIPS","author":"Hamilton"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00272"},{"key":"ref65","first-page":"37146","article-title":"Look around and refer: 2D synthetic semantics knowledge distillation for 3D visual grounding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bakr"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00257"},{"key":"ref67","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"Chung","year":"2014","journal-title":"arXiv:1412.3555"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref69","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00597"},{"key":"ref71","article-title":"Four ways to improve verbo-visual fusion for dense 3D visual grounding","author":"Unal","year":"2023","journal-title":"arXiv:2309.04561"},{"key":"ref72","article-title":"Learning point-language hierarchical alignment for 3D visual grounding","volume-title":"arXiv:2210.12513","author":"Chen","year":"2022"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"ref74","first-page":"10984","article-title":"Context-aware alignment and mutual masking for 3D-language pre-training","volume-title":"Proc. CVPR","author":"Zhao"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.656"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"ref78","article-title":"Toward explainable and fine-grained 3D grounding through referring textual phrases","author":"Yuan","year":"2022","journal-title":"arXiv:2207.01821"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28408"},{"key":"ref80","first-page":"487","article-title":"D3Net: A unified speaker-listener architecture for 3D dense captioning and visual grounding","volume-title":"Proc. ECCV","author":"Chen"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.56"},{"key":"ref82","article-title":"CoT3DRef: Chain-of-thoughts data-efficient 3D visual grounding","author":"Abdelrahman","year":"2023","journal-title":"arXiv:2310.06214"},{"key":"ref83","first-page":"49542","article-title":"Exploiting contextual objects and relations for 3D visual grounding","volume-title":"Proc. NeurIPS","volume":"36","author":"Yang"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01660"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/AISP61396.2024.10475280"},{"key":"ref87","article-title":"Data-efficient 3D visual grounding via order-aware referring","author":"Wu","year":"2024","journal-title":"arXiv:2403.16539"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01340"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/3dv62453.2024.00033"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72784-9_11"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1145\/3595916.3626405"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01633"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01333"},{"key":"ref94","article-title":"Dual attribute-spatial relation alignment for 3D visual grounding","author":"Xu","year":"2024","journal-title":"arXiv:2406.08907"},{"key":"ref95","article-title":"Multi-object 3D grounding with dynamic modules and language-informed spatial attention","author":"Zhang","year":"2024","journal-title":"arXiv:2410.22306"},{"key":"ref96","article-title":"Fine-grained spatial and verbal losses for 3D visual grounding","author":"Dey","year":"2024","journal-title":"arXiv:2411.03405"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32863"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2978386"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680614"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680758"},{"key":"ref101","article-title":"SeCG: Semantic-enhanced 3D visual grounding via cross-modal graph attention","author":"Xiao","year":"2024","journal-title":"arXiv:2403.08182"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28186"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01241"},{"key":"ref104","first-page":"26650","article-title":"LAMM: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yin"},{"key":"ref105","article-title":"Uni3DL: Unified model for 3D and language understanding","author":"Li","year":"2023","journal-title":"arXiv:2312.03026"},{"key":"ref106","article-title":"PointCloud-text matching: Benchmark datasets and a baseline","author":"Feng","year":"2024","journal-title":"arXiv:2403.19386"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25177"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28559"},{"key":"ref109","article-title":"Visual programming for zero-shot open-vocabulary 3D visual grounding","author":"Yuan","year":"2023","journal-title":"arXiv:2311.15383"},{"key":"ref110","article-title":"Grounding 3D scene affordance from egocentric interactions","author":"Liu","year":"2024","journal-title":"arXiv:2409.19650"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78113-1_17"},{"key":"ref112","article-title":"A unified framework for 3D point cloud visual grounding","author":"Lin","year":"2023","journal-title":"arXiv:2308.11887"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01320"},{"key":"ref114","article-title":"PD-TPE: Parallel decoder with text-guided position encoding for 3D visual grounding","author":"Hou","year":"2024","journal-title":"arXiv:2407.14491v1"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00347"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_22"},{"key":"ref117","article-title":"Weakly-supervised 3D visual grounding based on visual linguistic alignment","author":"Xu","year":"2023","journal-title":"arXiv:2312.09625"},{"key":"ref118","article-title":"Weakly-supervised 3D referring expression segmentation","author":"Liu","year":"2024","journal-title":"Openreview"},{"key":"ref119","article-title":"EmbodiedScan: A holistic multi-modal 3D perception suite towards embodied AI","author":"Wang","year":"2023","journal-title":"arXiv:2312.16170"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28525"},{"key":"ref121","article-title":"HiFi-CS: Towards open vocabulary visual grounding for robotic grasping using vision-language models","author":"Bhat","year":"2024","journal-title":"arXiv:2409.10419"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.236"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00208"},{"key":"ref124","article-title":"WildRefer: 3D object localization in large-scale dynamic scenes with multi-modal visual data and natural language","author":"Lin","year":"2023","journal-title":"arXiv:2304.05645"},{"key":"ref125","article-title":"LLM-grounder: Open-vocabulary 3D visual grounding with large language model as an agent","author":"Yang","year":"2023","journal-title":"arXiv:2309.12311"},{"key":"ref126","first-page":"1","article-title":"Transcribe3D: Grounding LLMs using transcribed information for 3D referential reasoning with self-corrected finetuning","volume-title":"Proc. 2nd Workshop Lang. Robot Learn., Lang. Grounding","author":"Fang"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW63481.2024.10645462"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01260"},{"key":"ref129","article-title":"3D-GRAND: Towards better grounding and less hallucination for 3D-LLMs","volume-title":"arXiv:2406.05132","author":"Yang","year":"2024"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_9"},{"key":"ref131","article-title":"Language-image models with 3D understanding","author":"Hyun Cho","year":"2024","journal-title":"arXiv:2405.03685"},{"key":"ref132","article-title":"Task-oriented sequential grounding and navigation in 3D scenes","author":"Zhang","year":"2024","journal-title":"arXiv:2408.04034"},{"key":"ref133","article-title":"Solving zero-shot 3D visual grounding as constraint satisfaction problems","author":"Yuan","year":"2024","journal-title":"arXiv:2411.14594"},{"key":"ref134","article-title":"SeeGround: See and ground for zero-shot open-vocabulary 3D visual grounding","author":"Li","year":"2024","journal-title":"arXiv:2412.04383"},{"key":"ref135","article-title":"VLM-grounder: A VLM agent for zero-shot 3D visual grounding","author":"Xu","year":"2024","journal-title":"arXiv:2410.13860"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1470"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32875"},{"key":"ref138","article-title":"Auto-GPT for online decision making: Benchmarks and additional opinions","author":"Yang","year":"2023","journal-title":"arXiv:2306.02224"},{"key":"ref139","first-page":"68539","article-title":"Toolformer: Language models can teach themselves to use tools","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Schick"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2023.123618"},{"issue":"8","key":"ref141","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref142","first-page":"22199","article-title":"Large language models are zero-shot reasoners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Kojima"},{"key":"ref143","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. NeurIPS","author":"Lee"},{"key":"ref144","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu"},{"key":"ref145","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref146","article-title":"MPLUG-owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023","journal-title":"arXiv:2304.14178"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.377"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.458"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.94"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.18"},{"key":"ref151","first-page":"20482","article-title":"3D-LLM: Injecting the 3D world into large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hong"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00593"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00257"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3365249"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548004"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00150"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16175"},{"key":"ref159","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Nichol","year":"2021","journal-title":"arXiv:2112.10741"},{"key":"ref160","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022","journal-title":"arXiv:2204.06125"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref162","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tsimpoukelli"},{"key":"ref163","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref164","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Alayrac"},{"key":"ref165","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NeurIPS","author":"Brown"},{"key":"ref166","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref167","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref168","first-page":"49250","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai"},{"key":"ref169","article-title":"GPT4Point: A unified framework for point-language understanding and generation","author":"Qi","year":"2023","journal-title":"arXiv:2312.02980"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681257"},{"key":"ref171","article-title":"Scene-LLM: Extending language model for 3D visual understanding and reasoning","author":"Fu","year":"2024","journal-title":"arXiv:2403.11401"},{"key":"ref172","article-title":"GPT4Scene: Understand 3D scenes from videos with vision-language models","author":"Qi","year":"2025","journal-title":"arXiv:2501.01428"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/CASE59546.2024.10711845"},{"key":"ref174","article-title":"ROOT: VLM based system for indoor scene understanding and beyond","author":"Wang","year":"2024","journal-title":"arXiv:2411.15714"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/11195929\/11095744.pdf?arnumber=11095744","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T17:39:26Z","timestamp":1759945166000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11095744\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":174,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2025.3584895","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}