{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T21:30:39Z","timestamp":1770845439597,"version":"3.50.1"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T00:00:00Z","timestamp":1759622400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T00:00:00Z","timestamp":1759622400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017607","name":"Shenzhen Fundamental Research Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100017607","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012234","name":"Shenzhen Peacock Plan","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012234","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,5]]},"DOI":"10.1109\/smc58881.2025.11343173","type":"proceedings-article","created":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:54:44Z","timestamp":1769633684000},"page":"7723-7730","source":"Crossref","is-referenced-by-count":0,"title":["QwenGrasp: Human-Robot Interactive 6-DoF Target-Oriented Grasping with Large Vision-Language Model"],"prefix":"10.1109","author":[{"given":"Xinyu","family":"Chen","sequence":"first","affiliation":[{"name":"Southern University of Science and Technology,Department of Computer Science and Engineering,Shenzhen,China,518055"}]},{"given":"Jian","family":"Yang","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Department of Computer Science and Engineering,Shenzhen,China,518055"}]},{"given":"Qi","family":"Zhao","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Department of Computer Science and Engineering,Shenzhen,China,518055"}]},{"given":"Zonghan","family":"He","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Department of Computer Science and Engineering,Shenzhen,China,518055"}]},{"given":"Haobin","family":"Yang","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Department of Computer Science and Engineering,Shenzhen,China,518055"}]},{"given":"Yuhui","family":"Shi","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Department of Computer Science and Engineering,Shenzhen,China,518055"}]}],"member":"263","reference":[{"key":"ref1","first-page":"287","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","volume-title":"Conference on Robot Learning","author":"Brohan"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161041"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561920"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8461041"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197318"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3131378"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1177\/0278364919868017"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161333"},{"key":"ref9","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01146"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160779"},{"key":"ref12","first-page":"53","article-title":"S4g: Amodal single-view single-shot se (3) grasp detection in cluttered scenes","volume-title":"Conference on robot learning","author":"Qin"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00299"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.16"},{"key":"ref15","first-page":"2165","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","volume-title":"Conference on Robot Learning","author":"Zitkovich"},{"key":"ref16","article-title":"Langrasp: An effective approach to semantic object grasping using large language models","volume-title":"First Workshop on Vision-Language Models for Navigation and Manipulation at ICRA 2024","author":"Mirjalili"},{"key":"ref17","article-title":"Moka: Open-vocabulary robotic manipulation through mark-based visual prompting","volume-title":"First Workshop on Vision-Language Models for Navigation and Manipulation at ICRA 2024","author":"Liu"},{"key":"ref18","article-title":"Palm-e: An embodied multimodal language model","author":"Driess","year":"2023"},{"key":"ref19","article-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models","author":"Huang","year":"2023"},{"key":"ref20","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/COASE.2016.7743488"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793972"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341545"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2970622"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3092640"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3123373"},{"key":"ref27","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","volume-title":"Conference on robot learning","author":"Shridhar"},{"key":"ref28","first-page":"13139","article-title":"Language-conditioned imitation learning for robot manipulation tasks","volume":"33","author":"Stepputtis","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref29","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref30","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561877"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01566"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3181735"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794435"},{"key":"ref35","article-title":"Reasoning tuning grasp: Adapting multi-modal large language models for robotic grasping","volume-title":"2nd Workshop on Language and Robot Learning: Language as Grounding","author":"Xu"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3320012"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341379"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/MLCCIM60412.2023.00062"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CAC63892.2024.10865585"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CASE59546.2024.10711845"},{"key":"ref41","article-title":"Look before you leap: Unveiling the power of gpt-4v in robotic vision-language planning","author":"Hu","year":"2023"}],"event":{"name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","location":"Vienna, Austria","start":{"date-parts":[[2025,10,5]]},"end":{"date-parts":[[2025,10,8]]}},"container-title":["2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11342430\/11342431\/11343173.pdf?arnumber=11343173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T20:50:46Z","timestamp":1770843046000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11343173\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,5]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/smc58881.2025.11343173","relation":{},"subject":[],"published":{"date-parts":[[2025,10,5]]}}}