{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:10:11Z","timestamp":1761894611522,"version":"build-2065373602"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/icme59968.2025.11209525","type":"proceedings-article","created":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T17:57:42Z","timestamp":1761847062000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["Open-Scene Understanding-oriented 3D Scene Graph Generation"],"prefix":"10.1109","author":[{"given":"Yuansu","family":"Hao","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fei","family":"Yu","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanhao","family":"Wang","sequence":"additional","affiliation":[{"name":"East China Normal University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuehua","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quan","family":"Deng","sequence":"additional","affiliation":[{"name":"Hangzhou Institute for Advanced Study, UCAS"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuan","family":"Yu","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Huang","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nan","family":"Che","sequence":"additional","affiliation":[{"name":"Harbin University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"46","article-title":"Taskography: Evaluating robot task planning over large 3D scene graphs","volume-title":"Proceedings of the 6th Conference on Robot Learning","author":"Agia"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161212"},{"key":"ref3","first-page":"23","article-title":"SayPlan: Grounding large language models using 3D scene graphs for scalable robot task planning","volume-title":"Proceedings of the 7th Conference on Robot Learning","author":"Rana"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610792"},{"key":"ref6","article-title":"When are lemons purple? the concept association bias of CLIP","volume":"abs\/2212.12043","author":"Yamada","year":"2022","journal-title":"CoRR"},{"article-title":"When and why vision-language models behave like bags-of-words, and what to do about it?","volume-title":"The Eleventh International Conference on Learning Representations","author":"Y\u00fcksekg\u00f6n\u00fcl","key":"ref7"},{"key":"ref8","article-title":"MobileSAMv2: Faster segment anything to everything","volume":"abs\/2312.09579","author":"Zhang","year":"2023","journal-title":"CoRR"},{"key":"ref9","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref10","article-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","volume":"abs\/2409.12191","author":"Wang","year":"2024","journal-title":"CoRR"},{"key":"ref11","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford"},{"key":"ref12","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Jia"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref14","article-title":"FoCTTA: Low-memory continual test-time adaptation with focus","volume":"abs\/2502.20677","author":"Hu","year":"2025","journal-title":"CoRR"},{"key":"ref15","first-page":"643","article-title":"Semantic abstraction: Open-world 3D scene understanding from 2D vision-language models","volume-title":"Proceedings of the 6th Conference on Robot Learning","author":"Ha"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00217"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00576"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00743"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00490"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02065"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01345"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02632"},{"key":"ref25","article-title":"Beyond bare queries: Open-vocabulary object retrieval with 3D scene graph","volume":"abs\/2406.07113","author":"Linok","year":"2024","journal-title":"CoRR"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3068335"},{"key":"ref27","article-title":"Open3D: A modern library for 3D data processing","volume":"abs\/1801.09847","author":"Zhou","year":"2018","journal-title":"CoRR"},{"article-title":"LLaVA-NeXT: Stronger LLMs supercharge multimodal capabilities in the wild","year":"2024","author":"Li","key":"ref28"},{"key":"ref29","article-title":"The Replica dataset: A digital replica of indoor spaces","volume":"abs\/1906.05797","author":"Straub","year":"2019","journal-title":"CoRR"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00958"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19\u20131423"}],"event":{"name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2025,6,30]]},"location":"Nantes, France","end":{"date-parts":[[2025,7,4]]}},"container-title":["2025 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11208895\/11208897\/11209525.pdf?arnumber=11209525","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:31:46Z","timestamp":1761888706000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11209525\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/icme59968.2025.11209525","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}