{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:17Z","timestamp":1777865297033,"version":"3.51.4"},"reference-count":94,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFE0203100"],"award-info":[{"award-number":["2024YFE0203100"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00943","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"1-12","source":"Crossref","is-referenced-by-count":0,"title":["RoboPearls: Editable Video Simulation for Robot Manipulation"],"prefix":"10.1109","author":[{"given":"Tang","family":"Tao","sequence":"first","affiliation":[{"name":"Shenzhen Campus of Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Likui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Youpeng","family":"Wen","sequence":"additional","affiliation":[{"name":"Shenzhen Campus of Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaidong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia-Wang","family":"Bian","sequence":"additional","affiliation":[{"name":"Bytedance Seed"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xia","family":"Zhou","sequence":"additional","affiliation":[{"name":"Li Auto Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianyi","family":"Yan","sequence":"additional","affiliation":[{"name":"Li Auto Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kun","family":"Zhan","sequence":"additional","affiliation":[{"name":"Li Auto Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Jia","sequence":"additional","affiliation":[{"name":"Li Auto Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hefeng","family":"Wu","sequence":"additional","affiliation":[{"name":"Li Auto Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liang","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.18"},{"key":"ref3","article-title":"Robocat: A selfimproving foundation agent for robotic manipulation","author":"Bousmalis","year":"2023","journal-title":"arXiv preprint"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/IRC.2020.00015"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01840"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3807511"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02029"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72664-4_21"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00014"},{"key":"ref12","author":"Coumans","year":"2016","journal-title":"Pybullet, a python module for physics simulation for games, robotics and machine learning"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-04870-8_16"},{"key":"ref14","article-title":"Vision-language models as success detectors","author":"Du","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561982"},{"key":"ref16","article-title":"Sam2act: Integrating visual foundation model with a memory architecture for robotic manipulation","author":"Fang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01975"},{"key":"ref18","article-title":"FigureAI","year":"2025","journal-title":"Helix: A vision-language-action model for generalist humanoid control"},{"key":"ref19","article-title":"Act3d: 3d feature field transformers for multi-task robotic manipulation","author":"Gervet","year":"2023","journal-title":"CoRL"},{"key":"ref20","first-page":"694","article-title":"Rvt: Robotic view transformer for 3d object manipulation","volume-title":"Conference on Robot Learning","author":"Goyal"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.055"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref23","article-title":"Maniskill2: A unified benchmark for generalizable manipulation skills","author":"Gu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref24","article-title":"Deepseek-r1: Incentivizing reasoning capability in 11 ms via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2026.3675320"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160216"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201293"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610677"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2974707"},{"key":"ref30","article-title":"Graspsplats: Efficient manipulation with 3d feature splatting","author":"Ji","year":"2024","journal-title":"arXiv preprint"},{"key":"ref31","author":"Ji","year":"2024","journal-title":"Segment any 4d gaussians"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.120"},{"key":"ref34","article-title":"Openvla: An open-source vision-language-action model","author":"Jin Kim","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref36","first-page":"80","article-title":"Behavior-1k: A benchmark for embodied ai with $1,000 \\text{ev}{-}$ eryday activities and realistic simulation","volume-title":"Conference on Robot Learning","author":"Li"},{"key":"ref37","article-title":"Langsurf: Language-embedded surface gaussians for 3d scene understanding","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1939"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref40","article-title":"Rdt-1b: a diffusion foundation model for bimanual manipulation","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01316"},{"key":"ref42","article-title":"Robo-gs: A physics consistent spatialtemporal model for robotic arm with hybrid representation","author":"Lou","year":"2024","journal-title":"arXiv preprint"},{"key":"ref43","article-title":"Turbo-gs: Accelerating 3d gaussian fitting for highquality radiance fields","author":"Lu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/3dv66043.2025.00019"},{"key":"ref45","article-title":"What matters in learning from offline human demonstrations for robot manipulation","author":"Mandlekar","year":"2021","journal-title":"arXiv preprint"},{"key":"ref46","article-title":"A language agent for autonomous driving","author":"Mao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3180108"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3067695.3082052"},{"key":"ref50","article-title":"Making images real again: A comprehensive survey on deep image composition","author":"Niu","year":"2021","journal-title":"arXiv preprint"},{"key":"ref51","article-title":"NVIDIA","year":"2021","journal-title":"Nvidia isaac sim"},{"key":"ref52","article-title":"Open x-embodiment: Robotic learning datasets and rt-x models","author":"O\u2019Neill","year":"2023","journal-title":"arXiv preprint"},{"key":"ref53","article-title":"OpenAI","year":"2023","journal-title":"Gpt-4v(ision) system card"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i10.37787"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.133"},{"key":"ref56","article-title":"Diffusiongpt: Llm-driven text-to-image generation system","author":"Qin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_21"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2013.6696520"},{"key":"ref59","article-title":"Robofail: Analyzing failures in robot learning policies","author":"Sagar","year":"2024","journal-title":"arXiv preprint"},{"key":"ref60","first-page":"1038","article-title":"Toolflownet: Robotic manipulation with tools via predicting tool flow from point clouds","volume-title":"Conference on Robot Learning","author":"Seita"},{"key":"ref61","article-title":"Hi robot: Open-ended instruction following with hierarchical vision-language-action models","author":"Xiaoyang Shi","year":"2025","journal-title":"arXiv preprint"},{"key":"ref62","first-page":"785","article-title":"Perceiveractor: A multi-task transformer for robotic manipulation","volume-title":"Conference on Robot Learning","author":"Shridhar"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00323"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_1"},{"key":"ref65","article-title":"Octo: An open-source generalist robot policy","author":"Model Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386109"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01700"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01956"},{"key":"ref69","article-title":"Softzoo: A soft robot co-design benchmark for locomotion in diverse environments","volume-title":"International Conference on Learning Representations","author":"Wang"},{"key":"ref70","article-title":"Robogen: Towards unleashing infinite data for automated robot learning via generative simulation","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01428"},{"key":"ref72","first-page":"192","article-title":"Fabricflownet: Bimanual cloth manipulation with a flow-based policy","volume-title":"Conference on Robot Learning","author":"Weng"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01920"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72630-9_4"},{"key":"ref75","article-title":"Autogen: Enabling next-gen llm applications via multi-agent conversation framework","author":"Wu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-024-0436-y"},{"key":"ref77","article-title":"Chaineddiffuser: Unifying trajectory diffusion and keypose prediction for robotic manipulation","volume-title":"Conference on Robot Learning","author":"Xian"},{"key":"ref78","article-title":"Fluidlab: A differentiable environment for benchmarking complex fluid manipulation","author":"Xian","year":"2023","journal-title":"arXiv preprint"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01111"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00420"},{"key":"ref81","article-title":"Gaussianproperty: Integrating physical properties to 3d gaussians with lmms","author":"Xu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72633-0_1"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610443"},{"key":"ref84","article-title":"Real-time photorealistic dynamic scene representation and rendering with 4 d gaussian splatting","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73397-0_10"},{"key":"ref86","article-title":"Large batch optimization for deep learning: Training bert in 76 minutes","author":"You","year":"2019","journal-title":"arXiv preprint"},{"key":"ref87","first-page":"1094","article-title":"Metaworld: A benchmark and evaluation for multi-task and meta reinforcement learning","volume-title":"Conference on robot learning","author":"Yu"},{"key":"ref88","article-title":"Learning to manipulate any-where: A visual generalizable framework for reinforcement learning","author":"Yuan","year":"2024","journal-title":"arXiv preprint"},{"key":"ref89","article-title":"Gnfactor: Multi-task real robot learning with generalizable neural feature fields","author":"Ze","year":"2023","journal-title":"arXiv preprint"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19821-2_41"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-91989-3_2"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3286816"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3432348"},{"key":"ref94","article-title":"Robot parkour learning","author":"Zhuang","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444462.pdf?arnumber=11444462","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:11:58Z","timestamp":1777529518000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444462\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":94,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00943","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}