{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T15:49:36Z","timestamp":1773935376897,"version":"3.50.1"},"reference-count":84,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1109\/icra57147.2024.10610792","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T17:51:05Z","timestamp":1723139465000},"page":"4303-4310","source":"Crossref","is-referenced-by-count":17,"title":["SG-Bot: Object Rearrangement via Coarse-to-Fine Robotic Imagination on Scene Graphs"],"prefix":"10.1109","author":[{"given":"Guangyao","family":"Zhai","sequence":"first","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Xiaoni","family":"Cai","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Dianye","family":"Huang","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Yan","family":"Di","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Fabian","family":"Manhardt","sequence":"additional","affiliation":[{"name":"Google"}]},{"given":"Federico","family":"Tombari","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Nassir","family":"Navab","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Benjamin","family":"Busam","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Rearrangement: A challenge for embodied ai","author":"Batra","year":"2020"},{"key":"ref2","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","volume-title":"ICLR","author":"Gu"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01217"},{"key":"ref6","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"NeurIPS"},{"key":"ref7","article-title":"Palm: Scaling language modeling with pathways","author":"Chowdhery","year":"2022"},{"key":"ref8","article-title":"Vima: General robot manipulation with multimodal prompts","volume-title":"ICML","author":"Jiang"},{"key":"ref9","article-title":"Behavior-1k: A benchmark for embodied ai with 1, 000 everyday activities and realistic simulation","author":"Li","year":"2023","journal-title":"CoRL"},{"key":"ref10","doi-asserted-by":"crossref","DOI":"10.1109\/IROS55552.2023.10342169","article-title":"Task and motion planning with large language models for object rearrangement","author":"Ding","year":"2023"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01437"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811817"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811931"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3272516"},{"key":"ref15","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"CoRL"},{"key":"ref16","article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","volume-title":"ICLR","author":"Zeng"},{"key":"ref17","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023","journal-title":"CoRL"},{"key":"ref18","article-title":"Palm-e: An embodied multimodal language model","volume-title":"ICML","author":"Driess"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01604"},{"issue":"1","key":"ref20","first-page":"1","article-title":"A comprehensive survey of scene graphs: Generation and application","volume-title":"T-PAMI","volume":"45","author":"Chang"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00133"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00576"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00526"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00743"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"ref28","article-title":"Scenegraph","author":"Fessenden","year":"2017"},{"key":"ref29","article-title":"Diffusion-based scene graph to image generation with masked contrastive pre-training","author":"Yang","year":"2022"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1177\/02783649211056674"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00381"},{"key":"ref33","article-title":"Commonscenes: Generating commonsense 3d indoor scenes with scene graphs","author":"Zhai","year":"2023","journal-title":"NeurIPS"},{"key":"ref34","article-title":"Selective object rearrangement in clutter","author":"Tang","year":"2023","journal-title":"CoRL"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561548"},{"key":"ref36","article-title":"Sayplan: Grounding large language models using 3d scene graphs for scalable task planning","author":"Rana","year":"2023","journal-title":"CoRL"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2011.6094737"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2016.7487583"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989544"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793616"},{"key":"ref41","first-page":"2016","article-title":"Pybullet, a python module for physics simulation for games, robotics and machine learning","author":"Coumans"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00346"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00469"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00694"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981506"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00666"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2022.3189959"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561877"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160779"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561716"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197485"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812367"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"ref54","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","author":"Szot","year":"2021","journal-title":"NeurIPS"},{"key":"ref55","article-title":"Ai2-thor: An interactive 3d environment for visual ai","author":"Kolve","year":"2017"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2974707"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01111"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636667"},{"key":"ref59","article-title":"Targf: Learning target gradient field to rearrange objects without explicit goal specification","author":"Wu","year":"2022","journal-title":"NeurIPS"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01825"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.030"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611220"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref65","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022"},{"key":"ref66","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref67","article-title":"Rt-1: Robotics transformer for real-world control at scale","author":"Brohan","year":"2022"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/34.121791"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/BF01427149"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.112948"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00030"},{"key":"ref73","article-title":"Auto-encoding variational bayes","volume-title":"ICLR","author":"Kingma"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3054619"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811809"},{"key":"ref76","article-title":"Housecat6d\u2013a large-scale multi-modal category level 6d object perception dataset with household objects in realistic scenarios","author":"Jung","year":"2022"},{"key":"ref77","article-title":"NVISII: Nvidia scene imaging interface","author":"Morrical","year":"2020"},{"key":"ref78","article-title":"Cliport: What and where pathways for robotic manipulation","author":"Shridhar","year":"2022","journal-title":"CoRL"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00634"},{"key":"ref80","article-title":"Atiss: Autoregressive transformers for indoor scene synthesis","author":"Paschalidou","year":"2021","journal-title":"NeurIPS"},{"key":"ref81","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","author":"Heusel","year":"2017","journal-title":"NeurIPS"},{"key":"ref82","article-title":"The role of imagenet classes in fr\u00e9chet inception distance","volume-title":"ICLR","author":"Kynk\u00e4\u00e4nniemi"},{"key":"ref83","article-title":"Reducing the barrier to entry of complex robotic software: a moveit! case study","author":"Coleman","year":"2014"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3240941"}],"event":{"name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","location":"Yokohama, Japan","start":{"date-parts":[[2024,5,13]]},"end":{"date-parts":[[2024,5,17]]}},"container-title":["2024 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10609961\/10609862\/10610792.pdf?arnumber=10610792","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,10]],"date-time":"2024-08-10T05:21:41Z","timestamp":1723267301000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10610792\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":84,"URL":"https:\/\/doi.org\/10.1109\/icra57147.2024.10610792","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]}}}