{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T06:15:28Z","timestamp":1765520128541,"version":"3.48.0"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iros60139.2025.11246352","type":"proceedings-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:54:45Z","timestamp":1764269685000},"page":"21281-21288","source":"Crossref","is-referenced-by-count":0,"title":["RoboEnvision: A Long-Horizon Video Generation Model for Multi-Task Robot Manipulation"],"prefix":"10.1109","author":[{"given":"Liudi","family":"Yang","sequence":"first","affiliation":[{"name":"University of Freiburg,Department of Computer Science,Germany"}]},{"given":"Yang","family":"Bai","sequence":"additional","affiliation":[{"name":"Ludwig Maximilian University of Munich,Germany"}]},{"given":"George","family":"Eskandar","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Germany"}]},{"given":"Fengyi","family":"Shen","sequence":"additional","affiliation":[{"name":"Technical University of Munich,Germany"}]},{"given":"Mohammad","family":"Altillawi","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Germany"}]},{"given":"Dong","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Germany"}]},{"given":"Soumajit","family":"Majumder","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Germany"}]},{"given":"Ziyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Germany"}]},{"given":"Gitta","family":"Kutyniok","sequence":"additional","affiliation":[{"name":"Ludwig Maximilian University of Munich,Germany"}]},{"given":"Abhinav","family":"Valada","sequence":"additional","affiliation":[{"name":"University of Freiburg,Department of Computer Science,Germany"}]}],"member":"263","reference":[{"article-title":"Video language planning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Du","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11128780"},{"article-title":"Learning universal policies via text-guided video generation","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Du","key":"ref3"},{"article-title":"Gr-2: A generative video-language-action model with web-scale knowledge for robot manipulation","year":"2024","author":"Cheang","key":"ref4"},{"article-title":"Learning interactive real-world simulators","volume-title":"The Twelfth International Conference on Learning Representations","author":"Yang","key":"ref5"},{"article-title":"Open-sora: Democratizing efficient video production for all","year":"2024","author":"Zheng","key":"ref6"},{"article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","year":"2024","author":"Yang","key":"ref7"},{"article-title":"Learning to act from actionless videos through dense correspondences","year":"2023","author":"Ko","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00245"},{"article-title":"Gen-l-video: Multi-text to long video generation via temporal co-denoising","year":"2023","author":"Wang","key":"ref10"},{"article-title":"VDT: General-purpose video diffusion transformers via mask modeling","volume-title":"The Twelfth International Conference on Learning Representations","author":"Lu","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.010"},{"article-title":"RDT-1b: a diffusion foundation model for bimanual manipulation","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Liu","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1177\/02783649241273668"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386109"},{"article-title":"Robodreamer: Learning compositional world models for robot imagination","year":"2024","author":"Zhou","key":"ref16"},{"article-title":"Genie: Generative interactive environments","volume-title":"Forty-first International Conference on Machine Learning","author":"Bruce","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_18"},{"article-title":"Irasim: Learning interactive real-robot action simulators","year":"2024","author":"Zhu","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2024.xx.092"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_18"},{"article-title":"Freenoise: Tuning-free longer video diffusion via noise rescheduling","volume-title":"The Twelfth International Conference on Learning Representations","author":"Qiu","key":"ref22"},{"article-title":"Freelong: Training-free long video generation with spectralblend temporal attention","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems","author":"Lu","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.73"},{"year":"2024","key":"ref25","article-title":"Learning to reason with llms"},{"year":"2025","key":"ref26","article-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning"},{"article-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models","year":"2023","author":"Ye","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3295255"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610216"},{"article-title":"Vlabench: A large-scale benchmark for languageconditioned robotics manipulation with long-horizon reasoning tasks","year":"2024","author":"Zhang","key":"ref30"},{"article-title":"LHManip: A dataset for long-horizon language-grounded manipulation tasks in cluttered tabletop environments","volume-title":"RSS 2024 Workshop: Data Generation for Robotics","author":"Ceola","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3180108"},{"article-title":"Lohoravens: A long-horizon language-conditioned benchmark for robotic tabletop manipulation","year":"2023","author":"Zhang","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.016"}],"event":{"name":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","start":{"date-parts":[[2025,10,19]]},"location":"Hangzhou, China","end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11245651\/11245652\/11246352.pdf?arnumber=11246352","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T06:10:59Z","timestamp":1765519859000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11246352\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/iros60139.2025.11246352","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}