{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T16:13:56Z","timestamp":1774368836089,"version":"3.50.1"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T00:00:00Z","timestamp":1746489600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T00:00:00Z","timestamp":1746489600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61903162"],"award-info":[{"award-number":["61903162"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Postgraduate Research & Practice Innovation Program of Jiangsu Province","award":["KYCX24_4093"],"award-info":[{"award-number":["KYCX24_4093"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Complex Intell. Syst."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s40747-025-01893-x","type":"journal-article","created":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T06:37:25Z","timestamp":1746513445000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["VLA-Grasp: a vision-language-action modeling with cross-modality fusion for task-oriented grasping"],"prefix":"10.1007","volume":"11","author":[{"given":"Jianwei","family":"Zhu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6485-9939","authenticated-orcid":false,"given":"Xueying","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Qiang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Mingmin","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,6]]},"reference":[{"issue":"2\u20133","key":"1893_CR1","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1177\/0278364919872545","volume":"39","author":"K Fang","year":"2020","unstructured":"Fang K, Zhu Y, Garg A, Kurenkov A, Mehta V, Fei-Fei L, Savarese S (2020) Learning task-oriented grasping for tool manipulation from simulated self-supervision. Int J Robot Res 39(2\u20133):202\u2013216. https:\/\/doi.org\/10.1177\/0278364919872545","journal-title":"Int J Robot Res"},{"issue":"11","key":"1893_CR2","doi-asserted-by":"publisher","first-page":"7551","DOI":"10.1109\/LRA.2023.3320012","volume":"8","author":"C Tang","year":"2023","unstructured":"Tang C, Huang D, Ge W, Liu W, Zhang H (2023) Graspgpt: leveraging semantic knowledge from a large language model for task-oriented grasping. IEEE Robot Autom Lett 8(11):7551\u20137558. https:\/\/doi.org\/10.1109\/LRA.2023.3320012","journal-title":"IEEE Robot Autom Lett"},{"key":"1893_CR3","doi-asserted-by":"publisher","unstructured":"Murali A, Liu W, Marino K, Chernova S, Gupta A (2020) Same object, different grasps: Data and semantic knowledge for task-oriented grasping. In: 4th Conference on robot learning, (CoRL 2020), vol. 155, pp. 1540\u20131557 . https:\/\/doi.org\/10.48550\/arXiv.2011.06431","DOI":"10.48550\/arXiv.2011.06431"},{"key":"1893_CR4","doi-asserted-by":"publisher","unstructured":"Li H, Zhang Y, Li Y, He H (2021) Learning task-oriented dexterous grasping from human knowledge. In: 2021 IEEE International Conference on Robotics and Automation (ICRA 2021), pp. 6192\u20136198 . https:\/\/doi.org\/10.1109\/ICRA48506.2021.9562073","DOI":"10.1109\/ICRA48506.2021.9562073"},{"issue":"2","key":"1893_CR5","doi-asserted-by":"publisher","first-page":"3352","DOI":"10.1109\/LRA.2020.2975706","volume":"5","author":"M Kokic","year":"2020","unstructured":"Kokic M, Kragic D, Bohg J (2020) Learning task-oriented grasping from human activity datasets. IEEE Robot Autom Lett 5(2):3352\u20133359. https:\/\/doi.org\/10.1109\/LRA.2020.2975706","journal-title":"IEEE Robot Autom Lett"},{"key":"1893_CR6","doi-asserted-by":"publisher","unstructured":"Tang C, Huang D, Meng L, Liu W, Zhang H (2023) Task-oriented grasp prediction with visual-language inputs. In: 2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS 2023), pp. 4881\u20134888 . https:\/\/doi.org\/10.1109\/IROS55552.2023.10342268","DOI":"10.1109\/IROS55552.2023.10342268"},{"key":"1893_CR7","doi-asserted-by":"publisher","unstructured":"Jin S, Xu J, Lei Y, Zhang L (2024) Reasoning grasping via multimodal large language model. https:\/\/doi.org\/10.48550\/arXiv.2402.06798","DOI":"10.48550\/arXiv.2402.06798"},{"key":"1893_CR8","doi-asserted-by":"publisher","unstructured":"Chen W, Liang H, Chen Z, Sun F, Zhang J (2022) Learning 6-dof task-oriented grasp detection via implicit estimation and visual affordance. In: 2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS 2022), pp. 762\u2013769 . https:\/\/doi.org\/10.1109\/IROS47612.2022.9981900","DOI":"10.1109\/IROS47612.2022.9981900"},{"key":"1893_CR9","doi-asserted-by":"publisher","unstructured":"Detry R, Papon J, Matthies LH (2017) Task-oriented grasping with semantic and geometric scene understanding. In: 2017 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS 2017), pp. 3266\u20133273 . https:\/\/doi.org\/10.1109\/IROS.2017.8206162","DOI":"10.1109\/IROS.2017.8206162"},{"key":"1893_CR10","doi-asserted-by":"publisher","unstructured":"Wang J, Sun L, Zhu X, Qian Q, Tomizuka M (2023) A simple approach for general task-oriented picking using placing constraints. https:\/\/doi.org\/10.48550\/ARXIV.2304.01290","DOI":"10.48550\/ARXIV.2304.01290"},{"issue":"10","key":"1893_CR11","doi-asserted-by":"publisher","first-page":"12521","DOI":"10.1109\/TPAMI.2023.3272571","volume":"45","author":"T Zhu","year":"2023","unstructured":"Zhu T, Wu R, Hang J, Lin X, Sun Y (2023) Toward human-like grasp: functional grasp by dexterous robotic hand via object-hand semantic representation. IEEE Trans Pattern Anal Mach Intell 45(10):12521\u201312534. https:\/\/doi.org\/10.1109\/TPAMI.2023.3272571","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1893_CR12","doi-asserted-by":"publisher","unstructured":"Mirjalili R, Krawez M, Silenzi S, Blei Y, Burgard W (2024) Lan-grasp: using large language models for semantic object grasping. https:\/\/doi.org\/10.48550\/arXiv.2310.05239","DOI":"10.48550\/arXiv.2310.05239"},{"key":"1893_CR13","unstructured":"Frome A, Corrado GS, Shlens J, Bengio S, Dean J, Ranzato MA, Mikolov T (2013) Devise: a deep visual-semantic embedding model. In: Advances in neural information processing systems (NIPS 2013), vol. 26, pp. 2121\u20132129"},{"key":"1893_CR14","doi-asserted-by":"publisher","unstructured":"Guo Z, Zhang R, Zhu X, Tang Y, Ma X, Han J, Chen K, Gao P, Li X, Li H, Heng P (2023) Point-bind & Point-LLM: aligning point cloud with multi-modality for 3D understanding, generation, and instruction following. https:\/\/doi.org\/10.48550\/ARXIV.2309.00615","DOI":"10.48550\/ARXIV.2309.00615"},{"key":"1893_CR15","doi-asserted-by":"publisher","unstructured":"Thoppilan R, Freitas DD, Hall J, Shazeer N, Kulshreshtha A, Cheng H-T, Jin A, Bos T, Baker L, Du Y, Li Y, Lee H, Zheng HS, Ghafouri A, Menegali M, Huang Y, Krikun M, Lepikhin D, Qin J, Chen D, Xu Y, Chen Z, Roberts A, Bosma M, Zhao V, Zhou Y, Chang C-C, Krivokon I, Rusch W, Pickett M, Srinivasan P, Man L, Meier-Hellstern K, Morris MR, Doshi T, Santos RD, Duke T, Soraker J, Zevenbergen B, Prabhakaran V, Diaz M, Hutchinson B, Olson K, Molina A, Hoffman-John E, Lee J, Aroyo L, Rajakumar R, Butryna A, Lamm M, Kuzmina V, Fenton J, Cohen A, Bernstein R, Kurzweil R, Aguera-Arcas B, Cui C, Croak M, Chi E, Le Q (2022) LaMDA: Language models for dialog applications . https:\/\/doi.org\/10.48550\/arXiv.2201.08239","DOI":"10.48550\/arXiv.2201.08239"},{"issue":"8","key":"1893_CR16","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford A, Wu J, Child R, Luan D, Amodei D, Sutskever I et al (2019) Language models are unsupervised multitask learners. OpenAI blog 1(8):9","journal-title":"OpenAI blog"},{"key":"1893_CR17","doi-asserted-by":"publisher","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language models are few-shot learners. In: Advances in neural information processing systems (NIPS 2020), vol. 33, pp. 1877\u20131901 . https:\/\/doi.org\/10.48550\/arXiv.2005.14165","DOI":"10.48550\/arXiv.2005.14165"},{"issue":"10","key":"1893_CR18","doi-asserted-by":"publisher","first-page":"3473","DOI":"10.1002\/acs.3885","volume":"38","author":"Z Peng","year":"2024","unstructured":"Peng Z, Song X, Song S, Stojanovic V (2024) Spatiotemporal fault estimation for switched nonlinear reaction-diffusion systems via adaptive iterative learning. Int J Adapt Control Signal Process 38(10):3473\u20133483. https:\/\/doi.org\/10.1002\/acs.3885","journal-title":"Int J Adapt Control Signal Process"},{"key":"1893_CR19","unstructured":"Jang E, Vijayanarasimhan S, Pastor P, Ibarz J, Levine S (2017) End-to-end learning of semantic grasping. In: Proceedings of the 1st Annual Conference on Robot Learning (CoRL 2017), vol. 78, pp. 119\u2013132"},{"key":"1893_CR20","doi-asserted-by":"publisher","unstructured":"Song Y, Sun P, Ren Y, Zheng Y, Zhang Y (2023) Learning 6-DoF fine-grained Grasp detection based on part affordance grounding. https:\/\/doi.org\/10.48550\/ARXIV.2301.11564","DOI":"10.48550\/ARXIV.2301.11564"},{"key":"1893_CR21","doi-asserted-by":"publisher","unstructured":"Tang C, Huang D, Dong W, Xu R, Zhang H (2024) FoundationGrasp: generalizable task-oriented grasping with foundation models. https:\/\/doi.org\/10.48550\/ARXIV.2404.10399","DOI":"10.48550\/ARXIV.2404.10399"},{"issue":"6","key":"1893_CR22","doi-asserted-by":"publisher","first-page":"7451","DOI":"10.1007\/s40747-023-01135-y","volume":"9","author":"Z Peng","year":"2023","unstructured":"Peng Z, Song X, Song S, Stojanovic V (2023) Hysteresis quantified control for switched reaction-diffusion systems and its application. Complex Intell Syst 9(6):7451\u20137460. https:\/\/doi.org\/10.1007\/s40747-023-01135-y","journal-title":"Complex Intell Syst"},{"key":"1893_CR23","doi-asserted-by":"publisher","unstructured":"Rashid A, Sharma S, Kim CM, Kerr J, Chen LY, Kanazawa A, Goldberg K (2023) Language embedded radiance fields for zero-shot task-oriented grasping. In: Conference on Robot Learning (CoRL 2023), vol. 229, pp. 178\u2013200 . https:\/\/doi.org\/10.48550\/arXiv.2309.07970","DOI":"10.48550\/arXiv.2309.07970"},{"key":"1893_CR24","doi-asserted-by":"publisher","unstructured":"Liu W, Daruna AA, Chernova S (2020) CAGE: context-aware grasping engine. In: 2020 IEEE International Conference on Robotics and Automation (ICRA 2020), pp. 2550\u20132556 . https:\/\/doi.org\/10.1109\/ICRA40945.2020.9197289","DOI":"10.1109\/ICRA40945.2020.9197289"},{"key":"1893_CR25","doi-asserted-by":"publisher","unstructured":"Li H, Mao W, Deng W, Meng C, Zhang R, Jia F, Wang T, Fan H, Wang H, Deng X (2024) SegGrasp: zero-shot task-oriented grasping via semantic and geometric guided segmentation. https:\/\/doi.org\/10.48550\/arXiv.2410.08901","DOI":"10.48550\/arXiv.2410.08901"},{"issue":"12","key":"1893_CR26","doi-asserted-by":"publisher","first-page":"8232","DOI":"10.1109\/LRA.2023.3326001","volume":"8","author":"V Holomjova","year":"2023","unstructured":"Holomjova V, Starkey AJ, Yun B, Mei\u00dfner P (2023) One-shot learning for task-oriented grasping. IEEE Robot Autom Lett 8(12):8232\u20138238. https:\/\/doi.org\/10.1109\/LRA.2023.3326001","journal-title":"IEEE Robot Autom Lett"},{"key":"1893_CR27","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning (ICML 2021), vol. 139, pp. 8748\u20138763"},{"issue":"2","key":"1893_CR28","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/S11263-023-01891-X","volume":"132","author":"P Gao","year":"2024","unstructured":"Gao P, Geng S, Zhang R, Ma T, Fang R, Zhang Y, Li H, Qiao Y (2024) Clip-adapter: Better vision-language models with feature adapters. Int J Comput Vis 132(2):581\u2013595. https:\/\/doi.org\/10.1007\/S11263-023-01891-X","journal-title":"Int J Comput Vis"},{"key":"1893_CR29","doi-asserted-by":"publisher","unstructured":"Wei Z, Pan Z, Owens A (2024) Efficient vision-language pre-training by cluster masking. In: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2024), pp. 26805\u201326815 . https:\/\/doi.org\/10.1109\/CVPR52733.2024.02532","DOI":"10.1109\/CVPR52733.2024.02532"},{"key":"1893_CR30","doi-asserted-by":"publisher","unstructured":"Parmar N, Vaswani A, Uszkoreit J, Kaiser Shazeer N, Ku A, Tran D (2018) Image Transformer . https:\/\/doi.org\/10.48550\/arXiv.1802.05751","DOI":"10.48550\/arXiv.1802.05751"},{"key":"1893_CR31","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser Lu, Polosukhin I (2017) Attention is all you need. In: Guyon I, Luxburg UV, Bengio S, Wallach H, Fergus R, Vishwanathan S, Garnett R (eds) Advances in Neural Information Processing Systems (NIPS 2017), vol. 30. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/8563f5ee243547dee91fbd053c1c4a845aa-Paper.pdf. Accessed on 17 Apr 2025"},{"key":"1893_CR32","doi-asserted-by":"publisher","unstructured":"Patankar A, Phi K, Mahalingam D, Chakraborty N, Ramakrishnan I (2023) Task-oriented grasping with point cloud representation of objects. In: 2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS 2023), pp. 6853\u20136860 . https:\/\/doi.org\/10.1109\/IROS55552.2023.10342318","DOI":"10.1109\/IROS55552.2023.10342318"},{"key":"1893_CR33","doi-asserted-by":"publisher","unstructured":"Xue L, Gao M, Xing C, Mart\u00edn-Mart\u00edn R, Wu J, Xiong C, Xu R, Niebles JC, Savarese S (2023) ULIP: learning a unified representation of language, images, and point clouds for 3d understanding. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2023), pp. 1179\u20131189 . https:\/\/doi.org\/10.1109\/CVPR52729.2023.00120","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"1893_CR34","doi-asserted-by":"publisher","unstructured":"Zhang R, Guo Z, Zhang W, Li K, Miao X, Cui B, Qiao Y, Gao P, Li H (2022) Pointclip: Point cloud understanding by CLIP. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022), pp. 8542\u20138552 . https:\/\/doi.org\/10.1109\/CVPR52688.2022.00836","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"1893_CR35","doi-asserted-by":"publisher","unstructured":"Li K, Wang J, Yang L, Lu C, Dai B (2024) Semgrasp : Semantic grasp generation via language aligned discretization. In: The European Conference on Computer Vision (ECCV 2024), vol. 15060, pp. 109\u2013127 . https:\/\/doi.org\/10.1007\/978-3-031-72627-9_7","DOI":"10.1007\/978-3-031-72627-9_7"},{"key":"1893_CR36","doi-asserted-by":"publisher","unstructured":"Li M, Zhao Q, Lyu S, Wang C, Ma Y, Cheng G, Yang C (2024) Ovgnet: a unified visual-linguistic framework for open-vocabulary robotic grasping. In: 2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS 2024), pp. 7507\u20137513 .https:\/\/doi.org\/10.1109\/IROS58592.2024.10802654","DOI":"10.1109\/IROS58592.2024.10802654"},{"key":"1893_CR37","doi-asserted-by":"publisher","unstructured":"Lu Y, Deng B, Wang Z, Zhi P, Li Y, Wang S (2022) Hybrid physical metric for 6-dof grasp pose detection. In: 2022 International Conference on Robotics and Automation (ICRA 2022), pp. 8238\u20138244 . https:\/\/doi.org\/10.1109\/ICRA46639.2022.9811961","DOI":"10.1109\/ICRA46639.2022.9811961"},{"key":"1893_CR38","doi-asserted-by":"publisher","unstructured":"Xu K, Zhao S, Zhou Z, Li Z, Pi H, Zhu Y, Wang Y, Xiong R (2023) A joint modeling of vision-language-action for target-oriented grasping in clutter. In: 2023 IEEE International Conference on Robotics and Automation (ICRA 2023), pp. 11597\u201311604 . https:\/\/doi.org\/10.1109\/ICRA48891.2023.10161041","DOI":"10.1109\/ICRA48891.2023.10161041"},{"key":"1893_CR39","doi-asserted-by":"publisher","unstructured":"Hu X, Xie X (2024) Pointnet++ network with contextual feature and mutual learning for point sets https:\/\/doi.org\/10.20944\/preprints202403.0743.v1","DOI":"10.20944\/preprints202403.0743.v1"},{"key":"1893_CR40","doi-asserted-by":"publisher","unstructured":"Choi M, Kim H, Han B, Xu N, Lee KM (2020) Channel attention is all you need for video frame interpolation. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI 2020), pp. 10663\u201310671 . https:\/\/doi.org\/10.1609\/AAAI.V34I07.6693","DOI":"10.1609\/AAAI.V34I07.6693"},{"key":"1893_CR41","doi-asserted-by":"publisher","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: 3rd International Conference on Learning Representations (ICLR 2015). https:\/\/doi.org\/10.48550\/arXiv.1412.6980","DOI":"10.48550\/arXiv.1412.6980"},{"key":"1893_CR42","doi-asserted-by":"publisher","unstructured":"Devlin J, Chang M, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding, 4171\u20134186 https:\/\/doi.org\/10.18653\/V1\/N19-1423","DOI":"10.18653\/V1\/N19-1423"}],"container-title":["Complex &amp; Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-01893-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40747-025-01893-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-01893-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,17]],"date-time":"2025-05-17T11:23:09Z","timestamp":1747480989000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40747-025-01893-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,6]]},"references-count":42,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1893"],"URL":"https:\/\/doi.org\/10.1007\/s40747-025-01893-x","relation":{},"ISSN":["2199-4536","2198-6053"],"issn-type":[{"value":"2199-4536","type":"print"},{"value":"2198-6053","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5,6]]},"assertion":[{"value":"7 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The article was submitted with the consent of all the authors to participate.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"272"}}