{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:48Z","timestamp":1781539068939,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Domain Foundation of Basic Strengthen Plan","award":["2025-JCJQ-JJ-0658"],"award-info":[{"award-number":["2025-JCJQ-JJ-0658"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810857","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"807-816","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OpenSGG-VL: Open-Vocabulary 3DSGG with Orthogonal Residual Fusion and Iterative Relation Refinement"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9634-5213","authenticated-orcid":false,"given":"Binbin","family":"Zhang","sequence":"first","affiliation":[{"name":"Nanjing University of Science and Technology, Nan Jing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9485-8311","authenticated-orcid":false,"given":"Fang","family":"Zhou","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Information Systems Engineering, Nan Jing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0178-9384","authenticated-orcid":false,"given":"Liang","family":"Xiao","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nan Jing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9483-5268","authenticated-orcid":false,"given":"Zhiyong","family":"Su","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nan Jing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1929-3654","authenticated-orcid":false,"given":"Weiqing","family":"Li","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nan Jing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_3_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02632"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00052"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00886"},{"key":"e_1_3_3_1_7_2","unstructured":"Priya Goyal Piotr Doll\u00e1r Ross Girshick Pieter Noordhuis Lukasz Wesolowski Aapo Kyrola Andrew Tulloch Yangqing Jia and Kaiming He. 2017. Accurate large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1706.02677 (2017)."},{"key":"e_1_3_3_1_8_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et\u00a0al. 2024. The llama 3 herd of models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610112"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Cheng-Yu Hsieh Jieyu Zhang Zixian Ma Aniruddha Kembhavi and Ranjay Krishna. 2023. Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality. Advances in neural information processing systems 36 (2023) 31096\u201331116.","DOI":"10.52202\/075280-1355"},{"key":"e_1_3_3_1_12_2","unstructured":"Diederik\u00a0P Kingma. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00337"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01345"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02022"},{"key":"e_1_3_3_1_16_2","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et\u00a0al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.19437 (2024)."},{"key":"e_1_3_3_1_17_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1608.03983 (2016)."},{"key":"e_1_3_3_1_18_2","unstructured":"Ilya Loshchilov Frank Hutter et\u00a0al. 2017. Fixing weight decay regularization in adam. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 5 (2017)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093411"},{"key":"e_1_3_3_1_21_2","first-page":"274","volume-title":"European Conference on Computer Vision","author":"Ma Yanni","year":"2024","unstructured":"Yanni Ma, Hao Liu, Yun Pei, and Yulan Guo. 2024. Heterogeneous graph learning for scene graph prediction in 3d point clouds. In European Conference on Computer Vision. Springer, 274\u2013291."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01050"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Dominic Maggio Yun Chang Nathan Hughes Matthew Trang Dan Griffith Carlyn Dougherty Eric Cristofalo Lukas Schmid and Luca Carlone. 2024. Clio: Real-time task-driven open-set 3d scene graphs. IEEE Robotics and Automation Letters (2024).","DOI":"10.1109\/LRA.2024.3451395"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et\u00a0al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems 35 (2022) 27730\u201327744.","DOI":"10.52202\/068431-2011"},{"key":"e_1_3_3_1_25_2","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga et\u00a0al. 2019. Pytorch: An imperative style high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_26_2","unstructured":"Jialun Pei Diandian Guo Jingyang Zhang Manxi Lin Yueming Jin and Pheng-Ann Heng. 2024. S 2 Former-OR: Single-Stage Bi-Modal Transformer for Scene Graph Generation in OR. IEEE Transactions on Medical Imaging (2024)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"e_1_3_3_1_28_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_29_2","unstructured":"Krishan Rana Jesse Haviland Sourav Garg Jad Abou-Chakra Ian Reid and Niko Suenderhauf. 2023. Sayplan: Grounding large language models using 3d scene graphs for scalable robot task planning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.06135 (2023)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR-Adjunct51615.2020.00072"},{"key":"e_1_3_3_1_31_2","unstructured":"Qwen Team. 2024. Qwen2.5: A party of foundation models. URL https:\/\/qwenlm. github. io\/blog\/qwen2 5 (2024)."},{"key":"e_1_3_3_1_32_2","unstructured":"Michael Tschannen Alexey Gritsenko Xiao Wang Muhammad\u00a0Ferjad Naeem Ibrahim Alabdulmohsin Nikhil Parthasarathy Talfan Evans Lucas Beyer Ye Xia Basil Mustafa et\u00a0al. 2025. Siglip 2: Multilingual vision-language encoders with improved semantic understanding localization and dense features. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.14786 (2025)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00775"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00732"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Xu Wang Yifan Li Qiudan Zhang Wenhui Wu Mark\u00a0Junjie Li Lin Ma and Jianmin Jiang. 2024. Weakly-Supervised 3D Scene Graph Generation via Visual-Linguistic Assisted Pseudo-labeling. IEEE Transactions on Multimedia (2024).","DOI":"10.1109\/TMM.2024.3443670"},{"key":"e_1_3_3_1_37_2","unstructured":"Xuezhi Wang Jason Wei Dale Schuurmans Quoc Le Ed Chi Sharan Narang Aakanksha Chowdhery and Denny Zhou. 2022. Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.11171 (2022)."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02065"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Fei Xia Ed Chi Quoc\u00a0V Le Denny Zhou et\u00a0al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022) 24824\u201324837.","DOI":"10.52202\/068431-1800"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.077"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00490"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00743"},{"key":"e_1_3_3_1_43_2","unstructured":"Yifan Xu Ziming Luo Qianwei Wang Vineet Kamat and Carol Menassa. 2024. Point2Graph: An End-to-end Point Cloud-based 3D Open-Vocabulary Scene Graph for Robot Navigation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.10350 (2024)."},{"key":"e_1_3_3_1_44_2","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et\u00a0al. 2025. Qwen3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09388 (2025)."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00008"},{"key":"e_1_3_3_1_46_2","first-page":"167","volume-title":"European Conference on Computer Vision","author":"Zhai Guangyao","year":"2024","unstructured":"Guangyao Zhai, Evin\u00a0P\u0131nar \u00d6rnek, Dave\u00a0Zhenyu Chen, Ruotong Liao, Yan Di, Nassir Navab, Federico Tombari, and Benjamin Busam. 2024. Echoscene: Indoor scene generation via information echo over scene graph diffusion. In European Conference on Computer Vision. Springer, 167\u2013184."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00260"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00958"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01595"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:46:55Z","timestamp":1781538415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810857"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":48,"alternative-id":["10.1145\/3805622.3810857","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810857","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}