{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:02:27Z","timestamp":1777654947016,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"publisher","award":["24511103100"],"award-info":[{"award-number":["24511103100"]}],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"publisher"}]},{"name":"NSFC Project","award":["62176061"],"award-info":[{"award-number":["62176061"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754918","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"1042-1051","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["A Neural Representation Framework with LLM-Driven Spatial Reasoning for Open-Vocabulary 3D Visual Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1323-7632","authenticated-orcid":false,"given":"Zhenyang","family":"Liu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8324-1528","authenticated-orcid":false,"given":"Sixiao","family":"Zheng","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3467-0197","authenticated-orcid":false,"given":"Siyu","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6745-9674","authenticated-orcid":false,"given":"Cairong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5199-5134","authenticated-orcid":false,"given":"Longfei","family":"Liang","sequence":"additional","affiliation":[{"name":"NeuhHelium Co., Ltd., Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4897-9209","authenticated-orcid":false,"given":"Xiangyang","family":"Xue","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6595-6893","authenticated-orcid":false,"given":"Yanwei","family":"Fu","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China and Shanghai Innovation Institute, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Robert A Brebin Loren Carpenter and Pat Hanrahan. 1998. Volume rendering. In Seminal graphics: pioneering efforts that shaped the field. 363-372.","DOI":"10.1145\/280811.281028"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3203102"},{"key":"e_1_3_2_1_6_1","volume-title":"European Conference on Computer Vision. Springer, 247-264","author":"Fu Yuqian","year":"2024","unstructured":"Yuqian Fu, Yu Wang, Yixuan Pan, Lian Huai, Xingyu Qiu, Zeyu Shangguan, Tong Liu, Yanwei Fu, Luc Van Gool, and Xingqun Jiang. 2024. Cross-domain few-shot object detection via enhanced open-set object detector. In European Conference on Computer Vision. Springer, 247-264."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"e_1_3_2_1_8_1","first-page":"203","volume-title":"Foundations and Trends\u00ae in Human-Computer Interaction","volume":"1","author":"Goodrich Michael A","year":"2008","unstructured":"Michael A Goodrich, Alan C Schultz, et al., 2008. Human-robot interaction: a survey. Foundations and Trends\u00ae in Human-Computer Interaction, Vol. 1, 3 (2008), 203-275."},{"key":"e_1_3_2_1_9_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"Semantic abstraction: Open-world 3d scene understanding from 2d vision-language models. arXiv preprint arXiv:2207.11514","author":"Ha Huy","year":"2022","unstructured":"Huy Ha and Shuran Song. 2022. Semantic abstraction: Open-world 3d scene understanding from 2d vision-language models. arXiv preprint arXiv:2207.11514 (2022)."},{"key":"e_1_3_2_1_11_1","first-page":"20482","article-title":"3d-llm: Injecting the 3d world into large language models","volume":"36","author":"Hong Yining","year":"2023","unstructured":"Yining Hong, Haoyu Zhen, Peihao Chen, Shuhong Zheng, Yilun Du, Zhenfang Chen, and Chuang Gan. 2023. 3d-llm: Injecting the 3d world into large language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20482-20494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","volume-title":"CCSUMSP: A cross-subject Chinese speech decoding framework with unified topology and multi-modal semantic pre-training. Information Fusion","author":"Huang Shuai","year":"2025","unstructured":"Shuai Huang, Yongxiong Wang, and Huan Luo. 2025a. CCSUMSP: A cross-subject Chinese speech decoding framework with unified topology and multi-modal semantic pre-training. Information Fusion (2025), 103022."},{"key":"e_1_3_2_1_13_1","volume-title":"A dual-branch generative adversarial network with self-supervised enhancement for robust auditory attention decoding. Engineering Applications of Artificial Intelligence","author":"Huang Shuai","year":"2025","unstructured":"Shuai Huang, Yongxiong Wang, and Huan Luo. 2025b. A dual-branch generative adversarial network with self-supervised enhancement for robust auditory attention decoding. Engineering Applications of Artificial Intelligence (2025), 111122."},{"key":"e_1_3_2_1_14_1","first-page":"1","article-title":"SSAAD: A Multi-Scale Temporal-Frequency Graph Network for Binary Auditory Attention Detection with Self-Supervised Learning. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Huang Shuai","year":"2025","unstructured":"Shuai Huang, Yongxiong Wang, Huan Luo, Shuwen Jia, Han Chen, Chendong Qin, Zhongcai He, and Rui Luo. 2025c. SSAAD: A Multi-Scale Temporal-Frequency Graph Network for Binary Auditory Attention Detection with Self-Supervised Learning. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5.","journal-title":"IEEE"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"e_1_3_2_1_17_1","volume-title":"GARField: Group Anything with Radiance Fields. arXiv preprint arXiv:2401.09419","author":"Kim Chung Min","year":"2024","unstructured":"Chung Min Kim, Mingxuan Wu, Justin Kerr, Ken Goldberg, Matthew Tancik, and Angjoo Kanazawa. 2024. GARField: Group Anything with Radiance Fields. arXiv preprint arXiv:2401.09419 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 (2017) 32-73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_20_1","volume-title":"Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q Weinberger, Serge Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546 (2022)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00029"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00079"},{"key":"e_1_3_2_1_24_1","volume-title":"Spatial-Temporal Aware Visuomotor Diffusion Policy Learning. arXiv preprint arXiv:2507.06710","author":"Liu Zhenyang","year":"2025","unstructured":"Zhenyang Liu, Yikai Wang, Kuanning Wang, Longfei Liang, Xiangyang Xue, and Yanwei Fu. 2025a. Spatial-Temporal Aware Visuomotor Diffusion Policy Learning. arXiv preprint arXiv:2507.06710 (2025)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00352"},{"key":"e_1_3_2_1_26_1","volume-title":"Conference on Robot Learning. PMLR, 1610-1620","author":"Lu Shiyang","year":"2023","unstructured":"Shiyang Lu, Haonan Chang, Eric Pu Jing, Abdeslam Boularias, and Kostas Bekris. 2023. Ovir-3d: Open-vocabulary 3d instance retrieval without training on 3d data. In Conference on Robot Learning. PMLR, 1610-1620."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_30_1","volume-title":"Instant neural graphics primitives with a multiresolution hash encoding. ACM transactions on graphics (TOG)","author":"M\u00fcller Thomas","year":"2022","unstructured":"Thomas M\u00fcller, Alex Evans, Christoph Schied, and Alexander Keller. 2022. Instant neural graphics primitives with a multiresolution hash encoding. ACM transactions on graphics (TOG), Vol. 41, 4 (2022), 1-15."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00985"},{"key":"e_1_3_2_1_32_1","volume-title":"Proc. of NeurIPS-Workshops","volume":"3","author":"Novosel Jelena","year":"2019","unstructured":"Jelena Novosel, Prashanth Viswanath, and Bruno Arsenali. 2019. Boosting semantic segmentation with multi-task self-supervised learning for autonomous driving applications. In Proc. of NeurIPS-Workshops, Vol. 3."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i6.32672"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"e_1_3_2_1_35_1","volume-title":"LangSplat: 3D Language Gaussian Splatting. arXiv preprint arXiv:2312.16084","author":"Qin Minghan","year":"2023","unstructured":"Minghan Qin, Wanhua Li, Jiawei Zhou, Haoqian Wang, and Hanspeter Pfister. 2023. LangSplat: 3D Language Gaussian Splatting. arXiv preprint arXiv:2312.16084 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs15143585"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561936"},{"key":"e_1_3_2_1_39_1","volume-title":"Human-robot interaction: status and challenges. Human factors","author":"Sheridan Thomas B","year":"2016","unstructured":"Thomas B Sheridan. 2016. Human-robot interaction: status and challenges. Human factors, Vol. 58, 4 (2016), 525-532."},{"key":"e_1_3_2_1_40_1","unstructured":"Julian Straub Thomas Whelan Lingni Ma Yufan Chen Erik Wijmans Simon Green Jakob J Engel Raul Mur-Artal Carl Ren Shobhit Verma et al. 2019. The replica dataset: A digital replica of indoor spaces. arXiv preprint arXiv:1906.05797 (2019)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00807"},{"key":"e_1_3_2_1_42_1","volume-title":"Etienne Pot, Andrea Tagliasacchi, and Daniel Duckworth.","author":"Vora Suhani","year":"2021","unstructured":"Suhani Vora, Noha Radwan, Klaus Greff, Henning Meyer, Kyle Genova, Mehdi SM Sajjadi, Etienne Pot, Andrea Tagliasacchi, and Daniel Duckworth. 2021. Nesf: Neural semantic fields for generalizable semantic segmentation of 3d scenes. arXiv preprint arXiv:2111.13260 (2021)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00342"},{"key":"e_1_3_2_1_48_1","volume-title":"Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. 2024b. Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385 (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00537"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00674"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754918","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:18:07Z","timestamp":1765340287000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754918"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3754918","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754918","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}