{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:20:48Z","timestamp":1777656048451,"version":"3.51.4"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1109\/icra57147.2024.10610193","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T17:51:05Z","timestamp":1723139465000},"page":"9411-9417","source":"Crossref","is-referenced-by-count":25,"title":["Open-Fusion: Real-time Open-Vocabulary 3D Mapping and Queryable Scene Representation"],"prefix":"10.1109","author":[{"given":"Kashu","family":"Yamazaki","sequence":"first","affiliation":[{"name":"University of Arkansas,AICV Lab,Department of EECS,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Taisei","family":"Hanyu","sequence":"additional","affiliation":[{"name":"University of Arkansas,AICV Lab,Department of EECS,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Khoa","family":"Vo","sequence":"additional","affiliation":[{"name":"University of Arkansas,AICV Lab,Department of EECS,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thang","family":"Pham","sequence":"additional","affiliation":[{"name":"University of Arkansas,AICV Lab,Department of EECS,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minh","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Arkansas,AICV Lab,Department of EECS,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gianfranco","family":"Doretto","sequence":"additional","affiliation":[{"name":"West Virginia University,Department of CSCE,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Liverpool,Department of CS,UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ngan","family":"Le","sequence":"additional","affiliation":[{"name":"University of Arkansas,AICV Lab,Department of EECS,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2886133"},{"key":"ref3","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref4","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"ICLR","author":"Jia","year":"2021"},{"key":"ref5","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International Conference on Machine Learning","author":"Li"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02219"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"ref11","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"Gu","year":"2021"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"ref13","article-title":"Language-driven Semantic Segmentation","author":"Li","year":"2022"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.074"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref17","article-title":"Semantic abstraction: Open-world 3d scene understanding from 2d vision-language models","volume-title":"6th Annual Conference on Robot Learning","author":"Ha"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.066"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref20","article-title":"Segment everything everywhere all at once","author":"Zou","year":"2023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"ref22","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref24","article-title":"Glipv2: Unifying localization and vision-language understanding","author":"Zhang","year":"2022","journal-title":"NIPS"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00375"},{"key":"ref27","article-title":"Semantic-sam: Segment and recognize anything at any granularity","author":"Li","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8967693"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2956367"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TAES.2016.140952"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref33","article-title":"The replica dataset: A digital replica of indoor spaces","author":"Straub","year":"2019"}],"event":{"name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","location":"Yokohama, Japan","start":{"date-parts":[[2024,5,13]]},"end":{"date-parts":[[2024,5,17]]}},"container-title":["2024 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10609961\/10609862\/10610193.pdf?arnumber=10610193","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,10]],"date-time":"2024-08-10T05:45:30Z","timestamp":1723268730000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10610193\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/icra57147.2024.10610193","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]}}}