{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T21:04:57Z","timestamp":1773435897548,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127896","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"13560-13566","source":"Crossref","is-referenced-by-count":2,"title":["OpenSU3D: Open World 3D Scene Understanding Using Foundation Models"],"prefix":"10.1109","author":[{"given":"Rafay","family":"Mohiuddin","sequence":"first","affiliation":[{"name":"Technical University of Munich,Chair of Computational Modeling &#x0026; Simulation,Germany,80333"}]},{"given":"Sai Manoj","family":"Prakhya","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Intelligent Cloud Technologies Lab,Germany,80992"}]},{"given":"Fiona","family":"Collins","sequence":"additional","affiliation":[{"name":"Technical University of Munich,Chair of Computational Modeling &#x0026; Simulation,Germany,80333"}]},{"given":"Ziyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Munich Research Center,Intelligent Cloud Technologies Lab,Germany,80992"}]},{"given":"Andr\u00e9","family":"Borrmann","sequence":"additional","affiliation":[{"name":"Technical University of Munich,Chair of Computational Modeling &#x0026; Simulation,Germany,80333"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref2","volume-title":"The replica dataset: A digital replica of indoor spaces.","author":"Straub","year":"2019"},{"key":"ref3","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Radford","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref5","article-title":"Visual instruction tuning","volume-title":"Proceedings of the Thirty-seventh Conference on Neural Information Processing Systems (NeurIPS)","author":"Liu","year":"2023"},{"key":"ref6","year":"2023","journal-title":"Gpt-4 technical report."},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00888"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"ref9","article-title":"3d-1lm: Injecting the 3d world into large language models","author":"Hong","year":"2023","journal-title":"in Neural Information Processing Systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.066"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"ref13","volume-title":"Grounded sam: Assembling open-world models for diverse visual tasks.","author":"Ren","year":"2024"},{"key":"ref14","article-title":"Blip: Bootstrapping languageimage pre-training for unified vision-language understanding and generation","volume-title":"Proceedings of the 39th International Conference on Machine Learning","volume":"162","author":"Li","year":"2022"},{"key":"ref15","article-title":"Blip-2: Bootstrapping languageimage pre-training with frozen image encoders and large language models","volume-title":"Proceedings of the 40th International Conference on Machine Learning","volume":"202","author":"Li","year":"2023"},{"key":"ref16","article-title":"Language-driven semantic segmentation","volume-title":"International Conference on Learning Representations (ICLR)","author":"Li","year":"2022"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"ref18","first-page":"19769","article-title":"Segment everything everywhere all at once","volume-title":"in Advances in Neural Information Processing Systems (A. Oh","volume":"36","author":"Zou","year":"2023"},{"key":"ref19","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"in Advances in Neural Information Processing Systems (H. Larochelle","volume":"33","author":"Brown","year":"2020"},{"key":"ref20","volume":"abs\/2302.13971","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2923960"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196885"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2022.xviii.050"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00743"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00490"},{"key":"ref27","volume-title":"Sam3d: Segment anything in 3d scenes.","author":"Yang","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref29","first-page":"68367","article-title":"Openmask3d: Open-vocabulary 3d instance segmentation","volume-title":"in Advances in Neural Information Processing Systems","volume":"36","author":"Takmaz","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_10"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610443"},{"key":"ref32","author":"Yang","year":"2023","journal-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73202-7_12"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02034"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72754-2_16"},{"key":"ref36","article-title":"Context-aware entity grounding with open-vocabulary 3d scene graphs","volume-title":"7th Annual Conference on Robot Learning","author":"Chang","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3451395"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127896.pdf?arnumber=11127896","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T19:50:09Z","timestamp":1773431409000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127896\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127896","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}