{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:13:01Z","timestamp":1777889581362,"version":"3.51.4"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00917","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"9834-9844","source":"Crossref","is-referenced-by-count":0,"title":["IRASim: A Fine-Grained World Model for Robot Manipulation"],"prefix":"10.1109","author":[{"given":"Fangqi","family":"Zhu","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongtao","family":"Wu","sequence":"additional","affiliation":[{"name":"ByteDance Seed"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"Guo","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxiao","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Seed"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chilam","family":"Cheang","sequence":"additional","affiliation":[{"name":"ByteDance Seed"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Kong","sequence":"additional","affiliation":[{"name":"ByteDance Seed"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Cosmos world foundation model platform for physical ai","author":"Agarwal","year":"2025","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1873"},{"key":"ref3","author":"Babaeizadeh","year":"2021","journal-title":"Fitvid: Overfitting in pixel-level video prediction"},{"key":"ref4","author":"Bao","year":"2024","journal-title":"Vidu: a highly consistent, dynamic and skilled text-to-video generator with diffusion models"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"ref6","author":"Brooks","year":"2024","journal-title":"Video generation models as world simulators"},{"key":"ref7","article-title":"Genie: Generative interactive environments","author":"Bruce","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Motion-conditioned diffusion model for controllable video synthesis","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"key":"ref10","first-page":"885","article-title":"Robonet: Large-scale multi-robot learning","volume-title":"Proceedings of the Conference on Robot Learning","author":"Dasari","year":"2020"},{"key":"ref11","author":"Guo","year":"2025","journal-title":"Deepseek-r1: Incentivizing reasoning capability in 11 ms via reinforcement learning"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2757"},{"key":"ref13","article-title":"Video language planning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Du","year":"2024"},{"key":"ref14","article-title":"Visual foresight: Model-based deep reinforcement learning for vision-based robotic control","author":"Ebert","year":"2018","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989324"},{"key":"ref16","article-title":"Unsupervised learning for physical interaction through video prediction","author":"Finn","year":"2016","journal-title":"Advances in neural information processing systems, 29"},{"key":"ref17","first-page":"158","article-title":"Implicit behavioral cloning","volume-title":"Conference on robot learning","author":"Florence","year":"2022"},{"key":"ref18","article-title":"Flip: Flow-centric generative planning for general-purpose manipulation tasks","author":"Gao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref19","first-page":"91560","article-title":"Vista: A generalizable driving world model with high fidelity and versatile controllability","volume":"37","author":"Gao","year":"2025","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref20","article-title":"Maskvit: Masked visual pre-training for video prediction","volume-title":"ICLR","author":"Gupta","year":"2023"},{"key":"ref21","article-title":"World models","author":"Ha","year":"2018","journal-title":"arXiv preprint"},{"key":"ref22","first-page":"2555","article-title":"Learning latent dynamics for planning from pixels","volume-title":"International conference on machine learning","author":"Hafner","year":"2019"},{"key":"ref23","article-title":"Mastering diverse domains through world models","author":"Hafner","year":"2023","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref25","author":"He","year":"2023","journal-title":"Latent video diffusion models for high-fidelity long video generation"},{"key":"ref26","first-page":"6629","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Heusel","year":"2017"},{"key":"ref27","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0628"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"ref30","article-title":"Gaia-1: A generative world model for autonomous driving","author":"Hu","year":"2023","journal-title":"arXiv preprint"},{"issue":"1","key":"ref31","volume":"62","author":"LeCun","year":"2022","journal-title":"A path towards autonomous machine intelligence version 0.9"},{"key":"ref32","article-title":"Evaluating real-world robot manipulation policies in simulation","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1939"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3295255"},{"key":"ref35","article-title":"Latte: Latent diffusion transformer for video generation","author":"Ma","year":"2024","journal-title":"arXiv preprint"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01374"},{"key":"ref37","author":"Parker-Holder","year":"2024","journal-title":"Genie 2: A large-scale foundation world model"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref39","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","year":"2023"},{"key":"ref40","author":"Qi","year":"2025","journal-title":"Strengthening generative robot policies through predictive world modeling"},{"key":"ref41","article-title":"Learning a driving simulator","author":"Santana","year":"2016","journal-title":"arXiv preprint"},{"key":"ref42","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"International conference on machine learning","author":"Sohl-Dickstein","year":"2015"},{"key":"ref43","article-title":"A controlcentric benchmark for video prediction","volume-title":"The Eleventh International Conference on Learning Representations","author":"Tian","year":"2023"},{"key":"ref44","author":"Unterthiner","year":"2019","journal-title":"Towards accurate generative models of video: A new metric & challenges"},{"key":"ref45","article-title":"Diffusion models are real-time game engines","author":"Valevski","year":"2024","journal-title":"arXiv preprint"},{"key":"ref46","first-page":"1723","article-title":"Bridgedata v2: A dataset for robot learning at scale","volume-title":"Conference on Robot Learning","author":"Walke","year":"2023"},{"key":"ref47","article-title":"Boximator: Generating rich and controllable motions for video synthesis","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref48","article-title":"Disco: Disentangled control for referring human dance generation in real world","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref50","author":"Wu","year":"2024","journal-title":"ivideogpt: Interactive videogpts are scalable world models"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1176\/appi.books.9781585622665.33114"},{"key":"ref52","article-title":"Spatial-temporal transformer networks for traffic flow forecasting","author":"Xu","journal-title":"arXiv preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00147"},{"key":"ref54","article-title":"Learning interactive real-world simulators","volume-title":"The Twelfth International Conference on Learning Representations","author":"Yang","year":"2024"},{"key":"ref55","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref56","author":"Yin","year":"2023","journal-title":"Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.016"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.106"},{"key":"ref59","author":"Zheng","year":"2024","journal-title":"Open-sora: Democratizing efficient video production for all"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444825.pdf?arnumber=11444825","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:12:59Z","timestamp":1777612379000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444825\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00917","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}