{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T03:21:43Z","timestamp":1779247303701,"version":"3.51.4"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128140","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"1-7","source":"Crossref","is-referenced-by-count":5,"title":["RTAGrasp: Learning Task-Oriented Grasping from Human Videos via Retrieval, Transfer, and Alignment"],"prefix":"10.1109","author":[{"given":"Wenlong","family":"Dong","sequence":"first","affiliation":[{"name":"Southern University of Science and Technology,Shenzhen Key Laboratory of Robotics and Computer Vision,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dehao","family":"Huang","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Shenzhen Key Laboratory of Robotics and Computer Vision,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiangshan","family":"Liu","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Department of Electronic and Electrical Engineering,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Tang","sequence":"additional","affiliation":[{"name":"Southern University of 
Science and Technology,Shenzhen Key Laboratory of Robotics and Computer Vision,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology,Shenzhen Key Laboratory of Robotics and Computer Vision,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3320012"},{"key":"ref2","article-title":"Grasp as you say: Language-guided dexterous grasp generation","author":"Wei","year":"2024","journal-title":"arXiv preprint"},{"key":"ref3","first-page":"1540","article-title":"Same object, different grasps: Data and semantic knowledge for task-oriented grasping","volume-title":"Conference on robot learning. PMLR","author":"Murali"},{"key":"ref4","article-title":"Language embedded radiance fields for zero-shot task-oriented grasping","volume-title":"7th Annual Conference on Robot Learning","author":"Rashid"},{"key":"ref5","article-title":"Langrasp: Using large language models for semantic object grasping","author":"Mirjalili","year":"2023","journal-title":"arXiv preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_13"},{"key":"ref7","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning. 
PMLR","author":"Radford"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2020.573730"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CIRA.2005.1554357"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2007.363582"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342268"},{"key":"ref13","article-title":"Reasoning grasping via multimodal large language model","author":"Jin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610008"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801661"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611213"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801656"},{"key":"ref18","article-title":"Leveraging semantic and geometric information for zero-shot robot-to-human handover","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref19","article-title":"Optimizing nerf-based slam with trajectory smoothness constraints","author":"He","year":"2024","journal-title":"arXiv preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00989"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00938"},{"key":"ref23","volume-title":"The grasps under varied object orientation dataset: Relation between grasps and object orientation","author":"Cheng","year":"2021"},{"key":"ref24","volume-title":"Gpt-4v: Openai vision model","year":"2023"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/0166-2236(95)93921-J"},{"key":"ref26","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp 
tasks","volume":"33","author":"Lewis","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref27","first-page":"1363","article-title":"Emergent correspondence from image diffusion","volume":"36","author":"Tang","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref28","article-title":"A tale of two features: Stable diffusion complements dino for zero-shot semantic correspondence","volume":"36","author":"Zhang","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00297"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-008-0152-6"},{"issue":"3","key":"ref31","first-page":"4","article-title":"Deep vit features as dense visual descriptors","volume":"2","author":"Amir","year":"2021","journal-title":"arXiv preprint"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561877"},{"key":"ref33","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation 
(ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128140.pdf?arnumber=11128140","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:16:16Z","timestamp":1756880176000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128140\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128140","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}