{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T22:16:26Z","timestamp":1775859386849,"version":"3.50.1"},"reference-count":67,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100011227","name":"Shenzhen Key Laboratory of Robotics and Computer Vision","doi-asserted-by":"publisher","award":["ZDSYS20220330160557001"],"award-info":[{"award-number":["ZDSYS20220330160557001"]}],"id":[{"id":"10.13039\/501100011227","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100008100","name":"High-Level Special Funds from the Southern University of Science and Technology, Shenzhen, China","doi-asserted-by":"publisher","award":["G03034K003"],"award-info":[{"award-number":["G03034K003"]}],"id":[{"id":"10.13039\/501100008100","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},
"short-container-title":["IEEE Trans. Automat. Sci. Eng."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tase.2025.3542418","type":"journal-article","created":{"date-parts":[[2025,2,17]],"date-time":"2025-02-17T18:42:54Z","timestamp":1739817774000},"page":"12418-12435","source":"Crossref","is-referenced-by-count":21,"title":["FoundationGrasp: Generalizable Task-Oriented Grasping With Foundation Models"],"prefix":"10.1109","volume":"22","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8287-7188","authenticated-orcid":false,"given":"Chao","family":"Tang","sequence":"first","affiliation":[{"name":"Shenzhen Key Laboratory of Robotics and Computer Vision and the Department of Electronic and Electrical Engineering, Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2693-9309","authenticated-orcid":false,"given":"Dehao","family":"Huang","sequence":"additional","affiliation":[{"name":"Shenzhen Key Laboratory of Robotics and Computer Vision and the Department of Electronic and Electrical Engineering, Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4127-2720","authenticated-orcid":false,"given":"Wenlong","family":"Dong","sequence":"additional","affiliation":[{"name":"Shenzhen Key Laboratory of Robotics and Computer Vision and the Department of Electronic and Electrical Engineering, Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3974-1629","authenticated-orcid":false,"given":"Ruinian","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute for Robotics and Intelligent Machines, Georgia Institute of Technology, Atlanta, GA, USA"}]},
{"ORCID":"https:\/\/orcid.org\/0000-0002-1677-6132","authenticated-orcid":false,"given":"Hong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shenzhen Key Laboratory of Robotics and Computer Vision and the Department of Electronic and Electrical Engineering, Southern University of Science and Technology, Shenzhen, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/56.769"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6385563"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8967992"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1177\/0278364919872545"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196971"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1126\/science.8316836"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1038\/nature17637"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2007.914848"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2010.5649406"},{"key":"ref10","first-page":"1540","article-title":"Same object, different grasps: Data and semantic knowledge for task-oriented grasping","volume-title":"Proc. Conf. robot Learn.","author":"Murali"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2017.8206162"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-018-9784-8"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref14","article-title":"VoxPoser: Composable 3D value maps for robotic manipulation with language models","author":"Huang","year":"2023","journal-title":"arXiv:2307.05973"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},
{"key":"ref16","first-page":"492","article-title":"LM-nav: Robotic navigation with large pre-trained models of language, vision, and action","volume-title":"Proc. Conf. Robot Learn.","author":"Shah"},{"key":"ref17","article-title":"Do as I can, not as I say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv:2204.01691"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3320012"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.1992.219918"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/70.508445"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1703.09312"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2018.2852777"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00299"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197256"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561877"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2023.3349171"},{"key":"ref28","article-title":"GraspLDM: Generative 6-DoF grasp synthesis using latent diffusion models","author":"Barad","year":"2023","journal-title":"arXiv:2312.11243"},{"key":"ref29","article-title":"Language-guided manipulation with diffusion policies and constrained inpainting","author":"Hao","year":"2024","journal-title":"arXiv:2406.09767"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_13"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2015.2409912"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2933815"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3062560"},{"key":"ref34","article-title":"Language-conditioned affordance-pose detection in 3D point clouds","author":"Nguyen","year":"2023","journal-title":"arXiv:2309.10911"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197289"},
{"key":"ref36","first-page":"178","article-title":"Language embedded radiance fields for zero-shot task-oriented grasping","volume-title":"Proc. 7th Annu. Conf. Robot Learn.","volume":"229","author":"Rashid"},{"key":"ref37","article-title":"Lan-grasp: Using large language models for semantic object grasping","author":"Mirjalili","year":"2023","journal-title":"arXiv:2310.05239"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801661"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342268"},{"key":"ref40","first-page":"1531","article-title":"Leveraging language for accelerated learning of tool manipulation","volume-title":"Proc. Conf. Robot Learn.","author":"Ren"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-021-10008-7"},{"key":"ref42","first-page":"23311","article-title":"Decomposing NeRF for editing via feature field distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Kobayashi"},{"key":"ref43","first-page":"1691","article-title":"Language grounding with 3D objects","volume-title":"Proc. Conf. Robot Learn.","author":"Thomason"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref45","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"139","author":"Radford"},
{"key":"ref46","first-page":"1025","article-title":"PointNet++: Deep hierarchical feature learning on point sets in a metric space","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Qi"},{"key":"ref47","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01563"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"ref50","article-title":"Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection","author":"Liu","year":"2023","journal-title":"arXiv:2303.05499"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2012.2200563"},{"key":"ref52","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014","journal-title":"arXiv:1412.6980"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/MRA.2011.2181749"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/HUMANOIDS.2015.7363472"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2000.844730"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2012.6225337"},{"key":"ref57","first-page":"250","article-title":"A2-nets: Double attention networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Chen"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref61","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},
{"issue":"140","key":"ref62","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref63","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref64","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Liu"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01895"},{"key":"ref67","volume-title":"Video Generation Models as World Simulators","year":"2024"}],"container-title":["IEEE Transactions on Automation Science and Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/8856\/10839176\/10891012.pdf?arnumber=10891012","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,16]],"date-time":"2025-04-16T17:55:09Z","timestamp":1744826109000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10891012\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":67,"URL":"https:\/\/doi.org\/10.1109\/tase.2025.3542418","relation":{},"ISSN":["1545-5955","1558-3783"],"issn-type":[{"value":"1545-5955","type":"print"},{"value":"1558-3783","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}