{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:04:45Z","timestamp":1765357485745,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972009"],"award-info":[{"award-number":["61972009"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611902","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"5017-5026","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["Dense Object Grounding in 3D Scenes"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1555-3674","authenticated-orcid":false,"given":"Wencan","family":"Huang","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-4508","authenticated-orcid":false,"given":"Daizong","family":"Liu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9860-0922","authenticated-orcid":false,"given":"Wei","family":"Hu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"ScanEnts3D: Exploiting Phrase-to-3D-Object Correspondences for Improved Visio-Linguistic Models in 3D Scenes. arXiv preprint arXiv:2212.06250","author":"Abdelreheem Ahmed","year":"2022","unstructured":"Ahmed Abdelreheem, Kyle Olszewski, Hsin-Ying Lee, Peter Wonka, and Panos Achlioptas. 2022. ScanEnts3D: Exploiting Phrase-to-3D-Object Correspondences for Improved Visio-Linguistic Models in 3D Scenes. arXiv preprint arXiv:2212.06250 (2022)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"e_1_3_2_2_3_1","first-page":"37146","article-title":"Look around and refer: 2d synthetic semantics knowledge distillation for 3d visual grounding","volume":"35","author":"Bakr Eslam","year":"2022","unstructured":"Eslam Bakr, Yasmeen Alsaedy, and Mohamed Elhoseiny. 2022. Look around and refer: 2d synthetic semantics knowledge distillation for 3d visual grounding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 37146--37158.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16175"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_2_7_1","volume-title":"UK","author":"Chen Dave Zhenyu","year":"2020","unstructured":"Dave Zhenyu Chen, Angel X Chang, and Matthias Nie\u00dfner. 2020a. Scanrefer: 3d object localization in rgb-d scans using natural language. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XX. Springer, 202--221."},{"key":"e_1_3_2_2_8_1","volume-title":"UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding. arXiv preprint arXiv:2212.00836","author":"Chen Dave Zhenyu","year":"2022","unstructured":"Dave Zhenyu Chen, Ronghang Hu, Xinlei Chen, Matthias Nie\u00dfner, and Angel X Chang. 2022b. UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding. arXiv preprint arXiv:2212.00836 (2022)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01282"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01518"},{"key":"e_1_3_2_2_11_1","unstructured":"Shizhe Chen Pierre-Louis Guhur Makarand Tapaswi Cordelia Schmid and Ivan Laptev. 2022a. Language Conditioned Spatial Relation Reasoning for 3D Object Grounding. In NeurIPS 2022--36th Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Sijin Chen Hongyuan Zhu Xin Chen Yinjie Lei Tao Chen and Gang YU. 2023. End-to-End 3D Dense Captioning with Vote2Cap-DETR. arXiv preprint arXiv:2301.02508 (2023).","DOI":"10.1109\/CVPR52729.2023.01070"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01010"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_2_17_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475397"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"e_1_3_2_2_22_1","volume-title":"Tel Aviv","author":"Jain Ayush","year":"2022","unstructured":"Ayush Jain, Nikolaos Gkanatsios, Ishita Mediratta, and Katerina Fragkiadaki. 2022. Bottom up top down detection transformers for language grounding in images and point clouds. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXVI. Springer, 417--433."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"e_1_3_2_2_24_1","volume-title":"Tel Aviv","author":"Jiao Yang","year":"2022","unstructured":"Yang Jiao, Shaoxiang Chen, Zequn Jie, Jingjing Chen, Lin Ma, and Yu-Gang Jiang. 2022. More: Multi-order relation mining for dense captioning in 3d scenes. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXV. Springer, 528--545."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01108"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414026"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00597"},{"key":"e_1_3_2_2_30_1","volume-title":"3D-QueryIS: A Query-based Framework for 3D Instance Segmentation. arXiv preprint arXiv:2211.09375","author":"Liu Jiaheng","year":"2022","unstructured":"Jiaheng Liu, Tong He, Honghui Yang, Rui Su, Jiayi Tian, Junran Wu, Hongcheng Guo, Ke Xu, and Wanli Ouyang. 2022. 3D-QueryIS: A Query-based Framework for 3D Instance Segmentation. arXiv preprint arXiv:2211.09375 (2022)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00431"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"e_1_3_2_2_34_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_2_35_1","volume-title":"Hierarchical question-image co-attention for visual question answering. Advances in neural information processing systems","author":"Lu Jiasen","year":"2016","unstructured":"Jiasen Lu, Jianwei Yang, Dhruv Batra, and Devi Parikh. 2016. Hierarchical question-image co-attention for visual question answering. Advances in neural information processing systems, Vol. 29 (2016)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"e_1_3_2_2_40_1","volume-title":"Pointnet: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems","author":"Qi Charles Ruizhongtai","year":"2017","unstructured":"Charles Ruizhongtai Qi, Li Yi, Hao Su, and Leonidas J Guibas. 2017. Pointnet: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems, Vol. 30 (2017)."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9982--9991","author":"Qi Yuankai","key":"e_1_3_2_2_41_1","unstructured":"Yuankai Qi, Qi Wu, Peter Anderson, Xin Wang, William Yang Wang, Chunhua Shen, and Anton van den Hengel. 2020. Reverie: Remote embodied visual referring expression in real indoor environments. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9982--9991."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042066"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_2_44_1","volume-title":"Conference on Robot Learning. PMLR, 1046--1056","author":"Roh Junha","year":"2022","unstructured":"Junha Roh, Karthik Desingh, Ali Farhadi, and Dieter Fox. 2022. Languagerefer: Spatial-language model for 3d visual grounding. In Conference on Robot Learning. PMLR, 1046--1056."},{"key":"e_1_3_2_2_45_1","volume-title":"Superpoint Transformer for 3D Scene Instance Segmentation. arXiv preprint arXiv:2211.15766","author":"Sun Jiahao","year":"2022","unstructured":"Jiahao Sun, Chunmei Qing, Junpeng Tan, and Xiangmin Xu. 2022. Superpoint Transformer for 3D Scene Instance Segmentation. arXiv preprint arXiv:2211.15766 (2022)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"e_1_3_2_2_47_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/194"},{"key":"e_1_3_2_2_50_1","volume-title":"EDA: Explicit Text-Decoupling and Dense Alignment for 3D Visual and Language Learning. arXiv preprint arXiv:2209.14941","author":"Wu Yanmin","year":"2022","unstructured":"Yanmin Wu, Xinhua Cheng, Renrui Zhang, Zesen Cheng, and Jian Zhang. 2022. EDA: Explicit Text-Decoupling and Dense Alignment for 3D Visual and Language Learning. arXiv preprint arXiv:2209.14941 (2022)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_2_54_1","volume-title":"Toward Explainable and Fine-Grained 3D Grounding through Referring Textual Phrases. arXiv preprint arXiv:2207.01821","author":"Yuan Zhihao","year":"2022","unstructured":"Zhihao Yuan, Xu Yan, Zhuo Li, Xuhao Li, Yao Guo, Shuguang Cui, and Zhen Li. 2022a. Toward Explainable and Fine-Grained 3D Grounding through Referring Textual Phrases. arXiv preprint arXiv:2207.01821 (2022)."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00837"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"e_1_3_2_2_58_1","volume-title":"Contextual Modeling for 3D Dense Captioning on Point Clouds. arXiv preprint arXiv:2210.03925","author":"Zhong Yufeng","year":"2022","unstructured":"Yufeng Zhong, Long Xu, Jiebo Luo, and Lin Ma. 2022. Contextual Modeling for 3D Dense Captioning on Point Clouds. arXiv preprint arXiv:2210.03925 (2022)."},{"key":"e_1_3_2_2_59_1","volume-title":"ConQueR: Query Contrast Voxel-DETR for 3D Object Detection. arXiv preprint arXiv:2212.07289","author":"Zhu Benjin","year":"2022","unstructured":"Benjin Zhu, Zhe Wang, Shaoshuai Shi, Hang Xu, Lanqing Hong, and Hongsheng Li. 2022. ConQueR: Query Contrast Voxel-DETR for 3D Object Detection. arXiv preprint arXiv:2212.07289 (2022)."},{"key":"e_1_3_2_2_60_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021","author":"Zhu Xizhou","year":"2021","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2021. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611902","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611902","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:11:59Z","timestamp":1755821519000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611902"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":60,"alternative-id":["10.1145\/3581783.3611902","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611902","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}