{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T15:15:28Z","timestamp":1784301328234,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680758","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"7995-8004","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Advancing 3D Object Grounding Beyond a Single 3D Scene"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1555-3674","authenticated-orcid":false,"given":"Wencan","family":"Huang","sequence":"first","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-4508","authenticated-orcid":false,"given":"Daizong","family":"Liu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9860-0922","authenticated-orcid":false,"given":"Wei","family":"Hu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"ScanEnts3D: Exploiting Phrase-to-3D-Object Correspondences for Improved Visio-Linguistic Models in 3D Scenes. arXiv preprint arXiv:2212.06250","author":"Abdelreheem Ahmed","year":"2022","unstructured":"Ahmed Abdelreheem, Kyle Olszewski, Hsin-Ying Lee, Peter Wonka, and Panos Achlioptas. 2022. ScanEnts3D: Exploiting Phrase-to-3D-Object Correspondences for Improved Visio-Linguistic Models in 3D Scenes. arXiv preprint arXiv:2212.06250 (2022)."},{"key":"e_1_3_2_2_2_1","volume-title":"Proceedings, Part I 16","author":"Achlioptas Panos","year":"2020","unstructured":"Panos Achlioptas, Ahmed Abdelreheem, Fei Xia, Mohamed Elhoseiny, and Leonidas Guibas. 2020. Referit3d: Neural listeners for fine-grained 3d object identification in real-world scenes. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I 16. Springer, 422--440."},{"key":"e_1_3_2_2_3_1","first-page":"37146","article-title":"Look around and refer: 2d synthetic semantics knowledge distillation for 3d visual grounding","volume":"35","author":"Bakr Eslam","year":"2022","unstructured":"Eslam Bakr, Yasmeen Alsaedy, and Mohamed Elhoseiny. 2022. Look around and refer: 2d synthetic semantics knowledge distillation for 3d visual grounding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 37146--37158.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_4_1","volume-title":"CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding. arXiv preprint arXiv:2310.06214","author":"Bakr Eslam Mohamed","year":"2023","unstructured":"Eslam Mohamed Bakr, Mohamed Ayman, Mahmoud Ahmed, Habib Slim, and Mohamed Elhoseiny. 2023. CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding. arXiv preprint arXiv:2310.06214 (2023)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"e_1_3_2_2_7_1","volume-title":"Proceedings, Part XX. Springer, 202--221","author":"Chen Dave Zhenyu","year":"2020","unstructured":"Dave Zhenyu Chen, Angel X Chang, and Matthias Nie\u00dfner. 2020. Scanrefer: 3d object localization in rgb-d scans using natural language. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XX. Springer, 202--221."},{"key":"e_1_3_2_2_8_1","volume-title":"UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding. arXiv preprint arXiv:2212.00836","author":"Chen Dave Zhenyu","year":"2022","unstructured":"Dave Zhenyu Chen, Ronghang Hu, Xinlei Chen, Matthias Nie\u00dfner, and Angel X Chang. 2022. UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding. arXiv preprint arXiv:2212.00836 (2022)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Dave Zhenyu Chen Qirui Wu Matthias Nie\u00dfner and Angel X Chang. 2021. D3Net: a speaker-listener architecture for semi-supervised dense captioning and visual grounding in RGB-D scans. (2021).","DOI":"10.1007\/978-3-031-19824-3_29"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01518"},{"key":"e_1_3_2_2_11_1","unstructured":"Shizhe Chen Pierre-Louis Guhur Makarand Tapaswi Cordelia Schmid and Ivan Laptev. 2022. Language Conditioned Spatial Relation Reasoning for 3D Object Grounding. In NeurIPS 2022--36th Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2480741.2480751"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2022.3141105"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3060412"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01211"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01410"},{"key":"e_1_3_2_2_19_1","volume-title":"International conference on machine learning. PMLR, 2535--2544","author":"Hacohen Guy","year":"2019","unstructured":"Guy Hacohen and Daphna Weinshall. 2019. On the power of curriculum learning in training deep networks. In International conference on machine learning. PMLR, 2535--2544."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475397"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00257"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611902"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"e_1_3_2_2_27_1","first-page":"18749","article-title":"Icnet: Intra-saliency correlation network for co-saliency detection","volume":"33","author":"Jin Wen-Da","year":"2020","unstructured":"Wen-Da Jin, Jun Xu, Ming-Ming Cheng, Yi Zhang, and Wei Guo. 2020. Icnet: Intra-saliency correlation network for co-saliency detection. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18749--18759.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01057"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings, Part VI 13","author":"Joulin Armand","year":"2014","unstructured":"Armand Joulin, Kevin Tang, and Li Fei-Fei. 2014. Efficient image and video co-localization with frank-wolfe algorithm. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part VI 13. Springer, 253--268."},{"key":"e_1_3_2_2_30_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00700"},{"key":"e_1_3_2_2_32_1","volume-title":"International Conference on Learning Representations.","author":"Li Yunsheng","year":"2021","unstructured":"Yunsheng Li and Yinpeng Chen. 2021. Revisiting Dynamic Convolution via Matrix Decomposition. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_2"},{"key":"e_1_3_2_2_34_1","volume-title":"A Unified Framework for 3D Point Cloud Visual Grounding. arXiv preprint arXiv:2308.11887","author":"Lin Haojia","year":"2023","unstructured":"Haojia Lin, Yongdong Luo, Xiawu Zheng, Lijiang Li, Fei Chao, Taisong Jin, Donghao Luo, Chengjie Wang, Yan Wang, and Liujuan Cao. 2023. A Unified Framework for 3D Point Cloud Visual Grounding. arXiv preprint arXiv:2308.11887 (2023)."},{"key":"e_1_3_2_2_35_1","volume-title":"Recent Advances, and Future Directions. arXiv preprint arXiv:2406.05785","author":"Liu Daizong","year":"2024","unstructured":"Daizong Liu, Yang Liu, Wencan Huang, and Wei Hu. 2024. A Survey on Text-guided 3D Visual Grounding: Elements, Recent Advances, and Future Directions. arXiv preprint arXiv:2406.05785 (2024)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00597"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680614"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"e_1_3_2_2_39_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.3390\/encyclopedia2010031"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"e_1_3_2_2_43_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 652--660","author":"Qi Charles R","year":"2017","unstructured":"Charles R Qi, Hao Su, Kaichun Mo, and Leonidas J Guibas. 2017. Pointnet: Deep learning on point sets for 3d classification and segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition. 652--660."},{"key":"e_1_3_2_2_44_1","volume-title":"Pointnet: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems","author":"Qi Charles Ruizhongtai","year":"2017","unstructured":"Charles Ruizhongtai Qi, Li Yi, Hao Su, and Leonidas J Guibas. 2017. Pointnet: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_45_1","volume-title":"Conference on Robot Learning. PMLR, 1046--1056","author":"Roh Junha","year":"2022","unstructured":"Junha Roh, Karthik Desingh, Ali Farhadi, and Dieter Fox. 2022. Languagerefer: Spatial-language model for 3d visual grounding. In Conference on Robot Learning. PMLR, 1046--1056."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.190"},{"key":"e_1_3_2_2_47_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/425"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01843"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01881"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00248"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612133"},{"key":"e_1_3_2_2_54_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Yang Li","year":"2024","unstructured":"Li Yang, Ziqi Zhang, Zhongang Qi, Yan Xu, Wei Liu, Ying Shan, Bing Li, Weiping Yang, Peng Li, Yan Wang, et al. 2024. Exploiting Contextual Objects and Relations for 3D Visual Grounding. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00105"},{"key":"e_1_3_2_2_57_1","volume-title":"Toward Explainable and Fine-Grained 3D Grounding through Referring Textual Phrases. arXiv preprint arXiv:2207.01821","author":"Yuan Zhihao","year":"2022","unstructured":"Zhihao Yuan, Xu Yan, Zhuo Li, Xuhao Li, Yao Guo, Shuguang Cui, and Zhen Li. 2022. Toward Explainable and Fine-Grained 3D Grounding through Referring Textual Phrases. arXiv preprint arXiv:2207.01821 (2022)."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01349"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00321"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00907"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00413"},{"key":"e_1_3_2_2_63_1","volume-title":"CoADNet: Collaborative aggregation-and-distribution networks for co-salient object detection. Advances in neural information processing systems","author":"Zhang Qijian","year":"2020","unstructured":"Qijian Zhang, Runmin Cong, Junhui Hou, Chongyi Li, and Yao Zhao. 2020. CoADNet: Collaborative aggregation-and-distribution networks for co-salient object detection. Advances in neural information processing systems, Vol. 33 (2020), 6959--6970."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"e_1_3_2_2_65_1","volume-title":"Proceedings, Part XII 16","author":"Zhang Zhao","year":"2020","unstructured":"Zhao Zhang, Wenda Jin, Jun Xu, and Ming-Ming Cheng. 2020. Gradient-induced co-saliency detection. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XII 16. Springer, 455--472."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01003"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00272"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3234586"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680758","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680758","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:42Z","timestamp":1750294662000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680758"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":69,"alternative-id":["10.1145\/3664647.3680758","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680758","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}