{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T18:38:20Z","timestamp":1772303900179,"version":"3.50.1"},"reference-count":45,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Knut and Alice Wallenberg Foundation through Wallenberg AI, Autonomous Systems, and Software Program"},{"name":"European Union’s Horizon Europe Framework Programme","award":["101070596"],"award-info":[{"award-number":["101070596"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Robot. Autom. Lett."],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1109\/lra.2025.3625497","type":"journal-article","created":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T18:06:17Z","timestamp":1761588377000},"page":"12995-13002","source":"Crossref","is-referenced-by-count":2,"title":["S$^{2}$-Diffusion: Generalizing From Instance-Level to Category-Level Skills in Robot Manipulation"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5655-0990","authenticated-orcid":false,"given":"Quantao","family":"Yang","sequence":"first","affiliation":[{"name":"Division of Robotics, Perception and Learning (RPL), KTH Royal Institute of Technology, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3827-3824","authenticated-orcid":false,"given":"Michael C.","family":"Welle","sequence":"additional","affiliation":[{"name":"Division of Robotics, Perception and Learning (RPL), KTH Royal Institute of Technology, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2965-2953","authenticated-orcid":false,"given":"Danica","family":"Kragic","sequence":"additional","affiliation":[{"name":"Division of Robotics, Perception and Learning (RPL), KTH Royal Institute of Technology, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7248-1112","authenticated-orcid":false,"given":"Olov","family":"Andersson","sequence":"additional","affiliation":[{"name":"Division of Robotics, Perception and Learning (RPL), KTH Royal Institute of Technology, Stockholm, Sweden"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2020.xvi.061"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"key":"ref3","first-page":"1199","article-title":"VIOLA: Imitation learning for vision-based manipulation with object proposal priors","volume-title":"Proc. Conf. Robot Learn.","author":"Zhu","year":"2023"},{"key":"ref4","first-page":"6892","article-title":"Open X-embodiment: Robotic learning datasets and RT-X models","volume-title":"Proc. IEEE Int. Conf. Robot. Automat.","author":"ONeill","year":"2024"},{"key":"ref5","article-title":"OCTO: An open-source generalist robot policy","author":"Team","year":"2024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3544909"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.016"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3443610"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3619794"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/RO-MAN60168.2024.10731242"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"ref13","article-title":"Grounded SAM: Assembling open-world models for diverse visual tasks","author":"Ren","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0688"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3196123"},{"key":"ref16","first-page":"3766","article-title":"Scaling up and distilling down: Language-guided robot skill acquisition","volume-title":"Proc. Conf. Robot Learn.","author":"Ha","year":"2023"},{"key":"ref17","article-title":"Unleashing large-scale video generative pre-training for visual robot manipulation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wu","year":"2024"},{"key":"ref18","article-title":"3D diffuser actor: Policy diffusion with 3D scene representations","author":"Ke","year":"2024"},{"key":"ref19","article-title":"ACT3D: Infinite resolution action detection transformer for robotic manipulation","author":"Gervet","year":"2023"},{"key":"ref20","article-title":"Learning generalizable feature fields for mobile manipulation","author":"Qiu","year":"2024"},{"key":"ref21","article-title":"SAM2Act: Integrating visual foundation model with a memory architecture for robotic manipulation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Fang","year":"2025"},{"key":"ref22","article-title":"GenDP: 3D semantic fields for category-level generalizable diffusion policy","volume-title":"Proc. 8th Annu. Conf. Robot Learn.","volume":"2","author":"Wang","year":"2024"},{"key":"ref23","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref24","article-title":"Theia: Distilling diverse vision foundation models for robot learning","volume-title":"Proc. Conf. Robot Learn.","author":"Shang","year":"2024"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.091"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref27","article-title":"Segment everything everywhere all at once","volume":"36","author":"Zou","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref28","article-title":"Do as I can, not as I say: Grounding language in robotic affordances","volume-title":"Proc. Conf. Robot Learn.","author":"Ahn","year":"2022"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"ref30","article-title":"Eureka: Human-level reward design via coding large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ma","year":"2024"},{"key":"ref31","first-page":"14743","article-title":"Zero-shot reward specification via grounded natural language","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Mahmoudieh","year":"2022"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.121"},{"key":"ref33","article-title":"MOKA: Open-vocabulary robotic manipulation through mark-based visual prompting","volume-title":"Proc. Robot.: Sci. Syst.","author":"Liu","year":"2024"},{"key":"ref34","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","volume-title":"Proc. 7th Conf. Robot Learn.","author":"Brohan","year":"2023"},{"key":"ref35","article-title":"Openvla: An open-source vision-language-action model","volume-title":"Proc. Conf. Robot Learn.","author":"Kim","year":"2024"},{"key":"ref36","article-title":"Scaling cross-embodied learning: One policy for manipulation, navigation, locomotion and aviation","volume-title":"Proc. Conf. Robot Learn.","author":"Doshi","year":"2024"},{"key":"ref37","article-title":"ShapeNet: An information-rich 3D model repository","author":"Chang","year":"2015"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.067"},{"key":"ref39","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"NeurIPS"},{"key":"ref40","article-title":"Imitating human behaviour with diffusion models","author":"Pearce","year":"2023"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.050"},{"key":"ref42","article-title":"Denoising diffusion implicit models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Song","year":"2021"},{"key":"ref43","article-title":"What matters in learning from offline human demonstrations for robot manipulation","volume-title":"Proc. Conf. Robot Learn.","author":"Mandlekar","year":"2021"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01111"},{"key":"ref45","article-title":"Quest2ROS: An app to facilitate teleoperating robots","volume-title":"Proc. 7th Int. Workshop Virtual, Augmented, Mixed-Reality Hum.-Robot Interact.","author":"Welle","year":"2024"}],"container-title":["IEEE Robotics and Automation Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7083369\/11215960\/11217171.pdf?arnumber=11217171","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T19:03:08Z","timestamp":1764010988000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11217171\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":45,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/lra.2025.3625497","relation":{},"ISSN":["2377-3766","2377-3774"],"issn-type":[{"value":"2377-3766","type":"electronic"},{"value":"2377-3774","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12]]}}}