{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:48:59Z","timestamp":1777654139962,"version":"3.51.4"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1109\/icra57147.2024.10611220","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T17:51:05Z","timestamp":1723139465000},"page":"4796-4803","source":"Crossref","is-referenced-by-count":18,"title":["Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language Models"],"prefix":"10.1109","author":[{"given":"Ivan","family":"Kapelyukh","sequence":"first","affiliation":[{"name":"The Robot Learning Lab at Imperial College,London"}]},{"given":"Yifei","family":"Ren","sequence":"additional","affiliation":[{"name":"The Robot Learning Lab at Imperial College,London"}]},{"given":"Ignacio","family":"Alzugaray","sequence":"additional","affiliation":[{"name":"The Dyson Robotics Lab"}]},{"given":"Edward","family":"Johns","sequence":"additional","affiliation":[{"name":"The Robot Learning Lab at Imperial College,London"}]}],"member":"263","reference":[{"key":"ref1","article-title":"CLIPort: What and where pathways for robotic manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Shridhar"},{"key":"ref2","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref3","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning, ICML","author":"Radford"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3272516"},{"key":"ref6","article-title":"Rearrangement: A challenge for embodied AI","author":"Batra","year":"2020"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2012.6224553"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_28"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341873"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2024.3438036"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3143518"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610792"},{"key":"ref13","article-title":"Transporter networks: Rearranging the visual world for robotic manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Zeng"},{"key":"ref14","first-page":"806","article-title":"Predicting stable configurations for semantic placement of novel objects","volume-title":"Conference on Robot Learning, 8-11 November 2021, London, UK, ser. Proceedings of Machine Learning Research","volume":"164","author":"Paxton"},{"key":"ref15","doi-asserted-by":"crossref","DOI":"10.15607\/RSS.2023.XIX.030","article-title":"Energy-based models are zero-shot planners for compositional scene rearrangement","volume-title":"Robotics: Science and Systems XIX","author":"Gkanatsios","year":"2023"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2015.7139396"},{"key":"ref17","article-title":"My house, my rules: Learning tidying preferences with graph neural networks","volume-title":"Conference on Robot Learning (CoRL)","author":"Kapelyukh"},{"key":"ref18","article-title":"Transformers are adaptable task planners","volume-title":"Conference on Robot Learning","author":"Jain"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811931"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.031"},{"key":"ref21","article-title":"SceneScore: Learning a cost function for object arrangement","volume-title":"CoRL 2023 Workshop on Learning Effective Abstractions for Planning (LEAP)","author":"Kapelyukh"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161528"},{"key":"ref23","article-title":"Shelving, stacking, hanging: Relational pose diffusion for multi-modal rearrangement","author":"Simeonov","year":"2023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_21"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342169"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10139-z"},{"key":"ref27","article-title":"Do as I can, not as I say: Grounding language in robotic affordances","author":"Ahn","year":"2022"},{"key":"ref28","article-title":"Open-world object manipulation using pre-trained vision-language model","author":"Stone","year":"2023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.027"},{"key":"ref30","article-title":"CACTI: A framework for scalable multi-task multi-scene visual imitation learning","author":"Mandi","year":"2022"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.010"},{"key":"ref32","article-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models","volume-title":"Conference on Robot Learning (CoRL)","author":"Huang"},{"key":"ref33","article-title":"Distilled feature fields enable few-shot language-guided manipulation","author":"Shen","year":"2023"},{"key":"ref34","article-title":"Language to rewards for robotic skill synthesis","author":"Yu","year":"2023","journal-title":"Arxiv preprint arXiv:2306.08647"},{"key":"ref35","article-title":"Liv: Language-image representations and rewards for robotic control","author":"Ma","year":"2023"},{"key":"ref36","first-page":"893","article-title":"Can foundation models perform zero-shot task specification for robot manipulation?","volume-title":"Proceedings of The 4th Annual Learning for Dynamics and Control Conference, ser. Proceedings of Machine Learning Research","volume":"168","author":"Cui"},{"key":"ref37","article-title":"Language embedded radiance fields for zero-shot task-oriented grasping","volume-title":"7th Annual Conference on Robot Learning","author":"Sharma"},{"key":"ref38","article-title":"MIRA: Mental imagery for robotic affordances","volume-title":"Conference on Robot Learning (CoRL)","author":"Yen-Chen"},{"key":"ref39","first-page":"1755","article-title":"Learning multi-object dynamics with compositional neural radiance fields","volume-title":"Proceedings of The 6th Conference on Robot Learning, ser. Proceedings of Machine Learning Research","volume":"205","author":"Driess"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530127"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00098"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00381"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20062-5_2"},{"key":"ref44","article-title":"Semantic abstraction: Open-world 3D scene understanding from 2D vision-language models","volume-title":"Proceedings of the 2022 Conference on Robot Learning","author":"Ha"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.066"},{"key":"ref46","article-title":"Clip-fields: Weakly supervised semantic fields for robotic memory","author":"Shafiullah","year":"2023","journal-title":"RSS"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"ref49","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref50","article-title":"GPT-4 technical report","year":"2023"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/237170.237269"},{"key":"ref52","article-title":"Open3D: A modern library for 3D data processing","author":"Zhou","year":"2018"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2895878"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811881"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161569"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2000.844730"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00816"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"ref60","article-title":"When and why vision-language models behave like bags-of-words, and what to do about it?","volume-title":"International Conference on Learning Representations","author":"Yuksekgonul"}],"event":{"name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","location":"Yokohama, Japan","start":{"date-parts":[[2024,5,13]]},"end":{"date-parts":[[2024,5,17]]}},"container-title":["2024 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10609961\/10609862\/10611220.pdf?arnumber=10611220","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,11]],"date-time":"2024-08-11T04:07:05Z","timestamp":1723349225000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10611220\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/icra57147.2024.10611220","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]}}}