{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:19:28Z","timestamp":1778080768737,"version":"3.51.4"},"reference-count":85,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128059","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"13582-13589","source":"Crossref","is-referenced-by-count":12,"title":["Beyond Bare Queries: Open-Vocabulary Object Grounding with 3D Scene Graph"],"prefix":"10.1109","author":[{"given":"Sergey","family":"Linok","sequence":"first","affiliation":[{"name":"Center for Cognitive Modeling, Moscow Institute of Physics and Technology,Dolgoprudny,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tatiana","family":"Zemskova","sequence":"additional","affiliation":[{"name":"Center for Cognitive Modeling, Moscow Institute of Physics and Technology,Dolgoprudny,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Svetlana","family":"Ladanova","sequence":"additional","affiliation":[{"name":"Center for Cognitive Modeling, Moscow Institute of Physics and Technology,Dolgoprudny,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Roman","family":"Titkov","sequence":"additional","affiliation":[{"name":"Center for Cognitive Modeling, Moscow Institute of Physics and Technology,Dolgoprudny,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dmitry","family":"Yudin","sequence":"additional","affiliation":[{"name":"Center for Cognitive Modeling, Moscow Institute of Physics and Technology,Dolgoprudny,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Maxim","family":"Monastyrny","sequence":"additional","affiliation":[{"name":"Sberbank of Russia, Robotics Center,Moscow,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aleksei","family":"Valenkov","sequence":"additional","affiliation":[{"name":"Sberbank of Russia, Robotics Center,Moscow,Russia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning. PMLR","author":"Radford","year":"2021"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.105171"},{"key":"ref4","article-title":"Eva-clip-18b: Scaling clip to 18 billion parameters","author":"Sun","year":"2024","journal-title":"arXiv preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01237"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref8","article-title":"Segment everything everywhere all at once","volume":"36","author":"Zou","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","author":"Liu","year":"2023","journal-title":"Visual instruction tuning"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref11","first-page":"19730","article-title":"Blip-2: Bootstrapping languageimage pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning. PMLR","author":"Li","year":"2023"},{"key":"ref12","article-title":"Am-radio: Agglomerative model-reduce all domains into one","author":"Ranzinger","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00490"},{"key":"ref14","first-page":"200212","article-title":"Multimodal few-shot learning with frozen language models","volume":"34","author":"Tsimpoukelli","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02484"},{"key":"ref16","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00576"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01345"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.077"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"ref21","article-title":"Leveraging large language models for robot 3d scene understanding","author":"Chen","year":"2022","journal-title":"arXiv preprint"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02674"},{"key":"ref23","first-page":"20482","article-title":"3d-1lm: Injecting the 3d world into large language models","volume":"36","author":"Hong","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","article-title":"Chat-3d v2: Bridging 3d scene and large language models with object identifiers","author":"Huang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref25","article-title":"Scene-1lm: Extending language model for 3d visual understanding and reasoning","author":"Fu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Language is not all you need: Aligning perception with language models","volume":"36","author":"Huang","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02059"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_21"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3343602"},{"issue":"3","key":"ref30","first-page":"4","article-title":"Deep vit features as dense visual descriptors","volume":"2","author":"Amir","year":"2021","journal-title":"arXiv preprint"},{"key":"ref31","first-page":"24 824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref32","article-title":"Vl-fields: Towards language-grounded neural implicit spatial representations","author":"Tsagkas","year":"2023","journal-title":"arXiv preprint"},{"key":"ref33","article-title":"Semantic abstraction: Open-world 3d scene understanding from 2d vision-language models","author":"Ha","year":"2022","journal-title":"arXiv preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.066"},{"key":"ref35","first-page":"1610","article-title":"Ovir-3d: Open-vocabulary 3d instance retrieval without training on 3d data","volume-title":"Conference on Robot Learning. PMLR","author":"Lu","year":"2023"},{"key":"ref36","article-title":"Open-fusion: Real-time open-vocabulary 3d mapping and queryable scene representation","author":"Yamazaki","year":"2023","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_10"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00385"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01874"},{"key":"ref40","article-title":"Openmask3d: Open-vocabulary 3d instance segmentation","author":"Takmaz","year":"2023","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00317"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref44","first-page":"53433","article-title":"Weakly supervised 3d open-vocabulary segmentation","volume":"36","author":"Liu","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3439737"},{"key":"ref46","article-title":"O2v-mapping: Online openvocabulary mapping with neural implicit representation","author":"Tie","year":"2024","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01948"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00510"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01895"},{"key":"ref50","volume-title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled Feature Fields","author":"Jiang","year":"2023"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73195-2_14"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73397-0_21"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.050"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2019.2931042"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.02065"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00260"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02632"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01546-9"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3441495"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_9"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111728"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00257"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01949"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610443"},{"key":"ref71","first-page":"20522","article-title":"Language conditioned spatial relation reasoning for 3d object grounding","volume":"35","author":"Chen","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref72","article-title":"Grounded 3d-11m with referent tokens","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/3DV62453.2024.00033"},{"key":"ref74","article-title":"Mobilesamv2: Faster segment anything to everything","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref75","article-title":"Vision transformers need registers","author":"Darcet","year":"2023","journal-title":"arXiv preprint"},{"key":"ref76","article-title":"The replica dataset: A digital replica of indoor spaces","author":"Straub","year":"2019","journal-title":"arXiv preprint"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref78","article-title":"Fine-grained visual prompting","volume":"36","author":"Yang","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref79","volume-title":"Llama 3 model card","year":"2024"},{"key":"ref80","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2021.3075644"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01578"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"ref84","article-title":"Boosttrack++: using tracklet information to detect more objects in multiple object tracking","author":"Stanojevi\u0107","year":"2024","journal-title":"arXiv preprint"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-025-02375-9"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128059.pdf?arnumber=11128059","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:46:27Z","timestamp":1756881987000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128059\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":85,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128059","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}