{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T16:37:57Z","timestamp":1757608677878,"version":"3.44.0"},"reference-count":58,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127873","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"2086-2093","source":"Crossref","is-referenced-by-count":0,"title":["Embodiment-agnostic Action Planning via Object-Part Scene Flow"],"prefix":"10.1109","author":[{"given":"Weiliang","family":"Tang","sequence":"first","affiliation":[{"name":"Chinese University of Hong Kong,Department of Computer Science and Engineering"}]},{"given":"Jia-Hui","family":"Pan","sequence":"additional","affiliation":[{"name":"Chinese University of Hong Kong,Department of Computer Science and Engineering"}]},{"given":"Wei","family":"Zhan","sequence":"additional","affiliation":[{"name":"UC Berkeley,The Department of Mechanical Engineering"}]},{"given":"Jianshu","family":"Zhou","sequence":"additional","affiliation":[{"name":"UC Berkeley,The Department of Mechanical Engineering"}]},{"given":"Huaxiu","family":"Yao","sequence":"additional","affiliation":[{"name":"UNC-Chapel Hill,The Department of Computer Science"}]},{"given":"Yun-Hui","family":"Liu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,The Department of Mechanical and Automation Engineering"}]},{"given":"Masayoshi","family":"Tomizuka","sequence":"additional","affiliation":[{"name":"UC Berkeley,The Department of Mechanical Engineering"}]},{"given":"Mingyu","family":"Ding","sequence":"additional","affiliation":[{"name":"UC Berkeley,The Department of Mechanical Engineering"}]},{"given":"Chi-Wing","family":"Fu","sequence":"additional","affiliation":[{"name":"Chinese University of Hong Kong,Department of Computer Science and Engineering"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.2174\/1573399812666160613113556"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989324"},{"key":"ref3","first-page":"4790","article-title":"Neural program synthesis from diverse demonstration videos","volume-title":"International Conference on Machine Learning. PMLR","author":"Sun","year":"2018"},{"journal-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","year":"2023","author":"Brohan","key":"ref4"},{"journal-title":"Open x-embodiment: Robotic learning datasets and rt-x models","year":"2023","author":"Padalkar","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"journal-title":"3d-vla: A 3d vision-language-action generative world model","year":"2024","author":"Zhen","key":"ref7"},{"key":"ref8","first-page":"589","article-title":"Motion planner augmented reinforcement learning for robot manipulation in obstructed environments","volume-title":"Conference on Robot Learning. PMLR","author":"Yamada","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3059912"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812140"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9340947"},{"key":"ref12","first-page":"651","article-title":"Dexvip: Learning dexterous grasping with human hand pose priors from video","volume-title":"Conference on Robot Learning. PMLR","author":"Mandikal","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.043"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.012"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01324"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00329"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00328"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_18"},{"journal-title":"General flow as foundation affordance for scalable robot learning","year":"2024","author":"Yuan","key":"ref19"},{"journal-title":"Large-scale actionless video pre-training via discrete diffusion for efficient policy learning","year":"2024","author":"He","key":"ref20"},{"journal-title":"Flow as the cross-domain manipulation interface","year":"2024","author":"Xu","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462901"},{"key":"ref23","article-title":"Third-person visual imitation learning via decoupled hierarchical controller","volume":"32","author":"Sharma","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2020.xvi.024"},{"journal-title":"Perceptual values from observation","year":"2019","author":"Edwards","key":"ref25"},{"journal-title":"Reinforcement learning with videos: Combining offline observations with interaction","year":"2020","author":"Schmeckpeper","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636080"},{"key":"ref28","first-page":"1930","article-title":"Model-based inverse reinforcement learning from visual demonstrations","volume-title":"Conference on Robot Learning. PMLR","author":"Das","year":"2021"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8968142"},{"journal-title":"On-line object representations with contrastive learning","year":"2019","author":"Pirk","key":"ref31"},{"journal-title":"R3m: A universal visual representation for robot manipulation","year":"2022","author":"Nair","key":"ref32"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801835"},{"key":"ref34","article-title":"Learning universal policies via text-guided video generation","volume":"36","author":"Du","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"journal-title":"ivideogpt: Interactive videogpts are scalable world models","year":"2024","author":"Wu","key":"ref35"},{"key":"ref36","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"journal-title":"Open-vla: An open-source vision-language-action model","year":"2024","author":"Kim","key":"ref37"},{"key":"ref38","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning. PMLR","author":"Radford","year":"2021"},{"key":"ref39","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International conference on machine learning. PMLR","author":"Li","year":"2022"},{"key":"ref40","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"International conference on machine learning. PMLR","author":"Kim","year":"2021"},{"journal-title":"Visualbert: A simple and performant baseline for vision and language","year":"2019","author":"Li","key":"ref41"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00915"},{"journal-title":"Grounded sam: Assembling open-world models for diverse visual tasks","year":"2024","author":"Ren","key":"ref43"},{"journal-title":"Language-driven semantic segmentation","year":"2022","author":"Li","key":"ref44"},{"journal-title":"Learning to act from actionless videos through dense correspondences","year":"2023","author":"Ko","key":"ref45"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2015.522"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1177\/0278364917735594"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"ref50","first-page":"1094","article-title":"Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning","volume-title":"Conference on robot learning. PMLR","author":"Yu","year":"2020"},{"journal-title":"Relay policy learning: Solving long-horizon tasks via imitation and reinforcement learning","year":"2019","author":"Gupta","key":"ref51"},{"journal-title":"Eureka: Human-level reward design via coding large language models","year":"2023","author":"Ma","key":"ref52"},{"journal-title":"Vip: Towards universal visual reward and representation via value-implicit pre-training","year":"2022","author":"Ma","key":"ref53"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.089"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2017.2716445"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TMECH.2022.3224183"},{"volume-title":"Open-sora: Democratizing efficient video production for all","year":"2024","author":"Zheng","key":"ref58"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2025,5,19]]},"location":"Atlanta, GA, USA","end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127873.pdf?arnumber=11127873","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:16:33Z","timestamp":1756880193000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127873\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127873","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}