{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,8]],"date-time":"2026-03-08T04:42:04Z","timestamp":1772944924337,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680998","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"8316-8325","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["RefMask3D: Language-Guided Transformer for 3D Referring Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1582-5684","authenticated-orcid":false,"given":"Shuting","family":"He","sequence":"first","affiliation":[{"name":"Institute of Big Data, Fudan University &amp; Nanyang Technological University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4868-6526","authenticated-orcid":false,"given":"Henghui","family":"Ding","sequence":"additional","affiliation":[{"name":"Institute of Big Data, Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jun Chen, and Mohamed Elhoseiny.","author":"Abdelreheem Ahmed","year":"2022","unstructured":"Ahmed Abdelreheem, Ujjwal Upadhyay, Ivan Skorokhodov, Rawan Al Yahya, Jun Chen, and Mohamed Elhoseiny. 2022. 3dreftransformer: Fine-grained object identification in real-world scenes using natural language. In WACV. 3941--3950."},{"key":"e_1_3_2_1_2_1","volume-title":"Referit3d: Neural listeners for fine-grained 3d object identification in real-world scenes","author":"Achlioptas Panos","unstructured":"Panos Achlioptas, Ahmed Abdelreheem, Fei Xia, Mohamed Elhoseiny, and Leonidas Guibas. 2020. Referit3d: Neural listeners for fine-grained 3d object identification in real-world scenes. In ECCV. Springer, 422--440."},{"key":"e_1_3_2_1_3_1","unstructured":"Eslam Bakr Yasmeen Alsaedy and Mohamed Elhoseiny. 2022. Look around and refer: 2d synthetic semantics knowledge distillation for 3d visual grounding. In NeurIPS. 37146--37158."},{"key":"e_1_3_2_1_4_1","unstructured":"Daigang Cai Lichen Zhao Jing Zhang Lu Sheng and Dong Xu. 2022. 3djcg: A unified framework for joint dense captioning and visual grounding on 3d point clouds. In CVPR. 16464--16473."},{"key":"e_1_3_2_1_5_1","volume-title":"Scanrefer: 3d object localization in rgb-d scans using natural language","author":"Chen Dave Zhenyu","unstructured":"Dave Zhenyu Chen, Angel X Chang, and Matthias Nie\u00dfner. 2020. Scanrefer: 3d object localization in rgb-d scans using natural language. In ECCV. Springer, 202--221."},{"key":"e_1_3_2_1_6_1","volume-title":"D3Net: A Unified Speaker-Listener Architecture for 3D Dense Captioning and Visual Grounding","author":"Chen Dave Zhenyu","unstructured":"Dave Zhenyu Chen, Qirui Wu, Matthias Nie\u00dfner, and Angel X Chang. 2022. D3Net: A Unified Speaker-Listener Architecture for 3D Dense Captioning and Visual Grounding. In ECCV. Springer, 487--505."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Bowen Cheng Ishan Misra Alexander G Schwing Alexander Kirillov and Rohit Girdhar. 2022. Masked-attention mask transformer for universal image segmentation. In CVPR. 1290--1299.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_8_1","unstructured":"Christopher Choy JunYoung Gwak and Silvio Savarese. 2019. 4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks. In CVPR."},{"key":"e_1_3_2_1_9_1","volume-title":"Scannet: Richly-annotated 3d reconstructions of indoor scenes. In CVPR. 5828--5839.","author":"Dai Angela","year":"2017","unstructured":"Angela Dai, Angel X Chang, Manolis Savva, Maciej Halber, Thomas Funkhouser, and Matthias Nie\u00dfner. 2017. Scannet: Richly-annotated 3d reconstructions of indoor scenes. In CVPR. 5828--5839."},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","volume-title":"Phraseclick: toward achieving flexible interactive segmentation by phrase and click","author":"Ding Henghui","unstructured":"Henghui Ding, Scott Cohen, Brian Price, and Xudong Jiang. 2020. Phraseclick: toward achieving flexible interactive segmentation by phrase and click. In ECCV. Springer, 417--435."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Henghui Ding Chang Liu Shuting He Xudong Jiang and Chen Change Loy. 2023. MeViS: A large-scale benchmark for video segmentation with motion expressions. In ICCV. 2694--2703.","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Henghui Ding Chang Liu Suchen Wang and Xudong Jiang. 2021. Vision-language transformer and query generation for referring segmentation. In ICCV. 16321--16330.","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3217852"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Mingtao Feng Zhen Li Qi Li Liang Zhang XiangDong Zhang Guangming Zhu Hui Zhang Yaonan Wang and Ajmal Mian. 2021. Free-form description guided 3d visual graph network for object grounding in point cloud. In ICCV. 3722--3731.","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Ziyu Guo Yiwen Tang Renrui Zhang Dong Wang Zhigang Wang Bin Zhao and Xuelong Li. 2023. ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with GPT and Prototype Guidance. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01410"},{"key":"e_1_3_2_1_17_1","unstructured":"Dailan He Yusheng Zhao Junyu Luo Tianrui Hui Shaofei Huang Aixi Zhang and Si Liu. 2021. Transrefer3d: Entity-and-relation aware transformer for fine-grained 3d visual grounding. In ACM MM. 2344--2352."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Shuting He and Henghui Ding. 2024. Decoupling static and hierarchical motion perception for referring video segmentation. In CVPR. 13332--13341.","DOI":"10.1109\/CVPR52733.2024.01266"},{"key":"e_1_3_2_1_19_1","unstructured":"Shuting He Henghui Ding and Wei Jiang. 2023. Primitive generation and semantic-related alignment for universal zero-shot segmentation. In CVPR."},{"key":"e_1_3_2_1_20_1","unstructured":"Shuting He Henghui Ding and Wei Jiang. 2023. Semantic-promoted debiasing and background disambiguation for zero-shot instance segmentation. In CVPR."},{"key":"e_1_3_2_1_21_1","unstructured":"Shuting He Henghui Ding Xudong Jiang and Bihan Wen. 2024. SegPoint: Segment Any Point Cloud via Large Language Model. In ECCV."},{"key":"e_1_3_2_1_22_1","volume-title":"GREC: Generalized referring expression comprehension. arXiv preprint arXiv:2308.16182","author":"He Shuting","year":"2023","unstructured":"Shuting He, Henghui Ding, Chang Liu, and Xudong Jiang. 2023. GREC: Generalized referring expression comprehension. arXiv preprint arXiv:2308.16182 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"2023 d. Prototype adaption and projection for few-and zero-shot 3d point cloud semantic segmentation","author":"He Shuting","year":"2023","unstructured":"Shuting He, Xudong Jiang, Wei Jiang, and Henghui Ding. 2023 d. Prototype adaption and projection for few-and zero-shot 3d point cloud semantic segmentation. IEEE TIP (2023), 3199--3211."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Joy Hsu Jiayuan Mao and Jiajun Wu. 2023. Ns3d: Neuro-symbolic grounding of 3d objects and relations. In CVPR. 2614--2623.","DOI":"10.1109\/CVPR52729.2023.00257"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Pin-Hao Huang Han-Hung Lee Hwann-Tzong Chen and Tyng-Luh Liu. 2021. Text-guided graph neural networks for referring 3d instance segmentation. In AAAI. 1610--1618.","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Shijia Huang Yilun Chen Jiaya Jia and Liwei Wang. 2022. Multi-view transformer for 3d visual grounding. In CVPR. 15524--15533.","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"e_1_3_2_1_27_1","volume-title":"Bottom up top down detection transformers for language grounding in images and point clouds","author":"Jain Ayush","unstructured":"Ayush Jain, Nikolaos Gkanatsios, Ishita Mediratta, and Katerina Fragkiadaki. 2022. Bottom up top down detection transformers for language grounding in images and point clouds. In ECCV. Springer, 417--433."},{"key":"e_1_3_2_1_28_1","volume-title":"Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144","author":"Jang Eric","year":"2016","unstructured":"Eric Jang, Shixiang Gu, and Ben Poole. 2016. Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144 (2016)."},{"key":"e_1_3_2_1_29_1","volume-title":"Pointgroup: Dual-set point grouping for 3d instance segmentation. In CVPR. 4867--4876.","author":"Jiang Li","year":"2020","unstructured":"Li Jiang, Hengshuang Zhao, Shaoshuai Shi, Shu Liu, Chi-Wing Fu, and Jiaya Jia. 2020. Pointgroup: Dual-set point grouping for 3d instance segmentation. In CVPR. 4867--4876."},{"key":"e_1_3_2_1_30_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Kaiming He Ross Girshick Carsten Rother and Piotr Doll\u00e1r. 2019. Panoptic segmentation. In CVPR. 9404--9413.","DOI":"10.1109\/CVPR.2019.00963"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Maxim Kolodiazhnyi Anna Vorontsova Anton Konushin and Danila Rukhovich. 2024. OneFormer3D: One transformer for unified point cloud segmentation. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01979"},{"key":"e_1_3_2_1_33_1","volume-title":"Lisa: Reasoning segmentation via large language model. In CVPR.","author":"Lai Xin","year":"2024","unstructured":"Xin Lai, Zhuotao Tian, Yukang Chen, Yanwei Li, Yuhui Yuan, Shu Liu, and Jiaya Jia. 2024. Lisa: Reasoning segmentation via large language model. In CVPR."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Xin Lai Yuhui Yuan Ruihang Chu Yukang Chen Han Hu and Jiaya Jia. 2023. Mask-Attention-Free Transformer for 3D Instance Segmentation. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00342"},{"key":"e_1_3_2_1_35_1","volume-title":"NeurIPS (2021)","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. NeurIPS (2021), 19652--19664."},{"key":"e_1_3_2_1_36_1","volume-title":"Transformer-based visual segmentation: A survey","author":"Li Xiangtai","year":"2024","unstructured":"Xiangtai Li, Henghui Ding, Haobo Yuan, Wenwei Zhang, Jiangmiao Pang, Guangliang Cheng, Kai Chen, Ziwei Liu, and Chen Change Loy. 2024. Transformer-based visual segmentation: A survey. IEEE TPAMI (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"GRES: Generalized Referring Expression Segmentation. In CVPR. 23592--23601.","author":"Liu Chang","year":"2023","unstructured":"Chang Liu, Henghui Ding, and Xudong Jiang. 2023. GRES: Generalized Referring Expression Segmentation. In CVPR. 23592--23601."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3277791"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3163578"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-024-00049-8"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Chang Liu Xiangtai Li and Henghui Ding. 2024. Referring Image Editing: Object-level Image Editing via Referring Expressions. In CVPR. 13128--13138.","DOI":"10.1109\/CVPR52733.2024.01247"},{"key":"e_1_3_2_1_42_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Ze Liu Yutong Lin Yue Cao Han Hu Yixuan Wei Zheng Zhang Stephen Lin and Baining Guo. 2021. Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV. 10012--10022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Ze Liu Zheng Zhang Yue Cao Han Hu and Xin Tong. 2021. Group-free 3d object detection via transformers. In ICCV. 2949--2958.","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Gen Luo Yiyi Zhou Xiaoshuai Sun Liujuan Cao Chenglin Wu Cheng Deng and Rongrong Ji. 2020. Multi-task collaborative network for joint referring expression comprehension and segmentation. In CVPR. 10034--10043.","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Junyu Luo Jiahui Fu Xianghao Kong Chen Gao Haibing Ren Hao Shen Huaxia Xia and Si Liu. 2022. 3d-sps: Single-stage 3d visual grounding via referred point progressive selection. In CVPR. 16454--16463.","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Junhua Mao Jonathan Huang Alexander Toshev Oana Camburu Alan L Yuille and Kevin Murphy. 2016. Generation and comprehension of unambiguous object descriptions. In CVPR. 11--20.","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Ishan Misra Rohit Girdhar and Armand Joulin. 2021. An end-to-end transformer model for 3d object detection. In CVPR. 2906--2917.","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"e_1_3_2_1_49_1","volume-title":"Pointnet: Deep hierarchical feature learning on point sets in a metric space. In NeurIPS.","author":"Qi Charles Ruizhongtai","year":"2017","unstructured":"Charles Ruizhongtai Qi, Li Yi, Hao Su, and Leonidas J Guibas. 2017. Pointnet: Deep hierarchical feature learning on point sets in a metric space. In NeurIPS."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Zhipeng Qian Yiwei Ma Jiayi Ji and Xiaoshuai Sun. 2024. X-RefSeg3D: Enhancing Referring 3D Instance Segmentation via Structured Cross-Modal Graph Neural Networks. In AAAI. 4551--4559.","DOI":"10.1609\/aaai.v38i5.28254"},{"key":"e_1_3_2_1_51_1","volume-title":"Conference on Robot Learning. PMLR, 1046--1056","author":"Roh Junha","year":"2022","unstructured":"Junha Roh, Karthik Desingh, Ali Farhadi, and Dieter Fox. 2022. Languagerefer: Spatial-language model for 3d visual grounding. In Conference on Robot Learning. PMLR, 1046--1056."},{"key":"e_1_3_2_1_52_1","volume-title":"Mask3D: Mask Transformer for 3D Semantic Instance Segmentation","author":"Schult Jonas","unstructured":"Jonas Schult, Francis Engelmann, Alexander Hermans, Or Litany, Siyu Tang, and Bastian Leibe. 2023. Mask3D: Mask Transformer for 3D Semantic Instance Segmentation. In ICRA. IEEE, 8216--8223."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Yanyan Shao Shuting He Qi Ye Yuchao Feng Wenhan Luo and Jiming Chen. 2024. Context-Aware Integration of Language and Visual References for Natural Language Tracking. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Jiahao Sun Chunmei Qing Junpeng Tan and Xiangmin Xu. 2023. Superpoint transformer for 3d scene instance segmentation. In AAAI. 2393--2401.","DOI":"10.1609\/aaai.v37i2.25335"},{"key":"e_1_3_2_1_55_1","volume-title":"Cris: Clip-driven referring image segmentation. In CVPR. 11686--11695.","author":"Wang Zhaoqing","year":"2022","unstructured":"Zhaoqing Wang, Yu Lu, Qiang Li, Xunqiang Tao, Yandong Guo, Mingming Gong, and Tongliang Liu. 2022. Cris: Clip-driven referring image segmentation. In CVPR. 11686--11695."},{"key":"e_1_3_2_1_56_1","unstructured":"Changli Wu Yihang Liu Yiwei Ma Haowei Wang Gen Luo Jiayi Ji Henghui Ding Xiaoshuai Sun and Rongrong Ji. 2024. 3D-GRES: Generalized 3D Referring Expression Segmentation. In ACM MM."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Changli Wu Yiwei Ma Qi Chen Haowei Wang Gen Luo Jiayi Ji and Xiaoshuai Sun. 2024 d. 3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation. In AAAI. 5940--5948.","DOI":"10.1609\/aaai.v38i6.28408"},{"key":"e_1_3_2_1_58_1","first-page":"1782","article-title":"Towards robust referring image segmentation","volume":"33","author":"Wu Jianzong","year":"2024","unstructured":"Jianzong Wu, Xiangtai Li, Xia Li, Henghui Ding, Yunhai Tong, and Dacheng Tao. 2024. Towards robust referring image segmentation. IEEE TIP, Vol. 33 (2024), 1782--1794.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_59_1","unstructured":"Jianzong Wu Xiangtai Li Shilin Xu Haobo Yuan Henghui Ding Yibo Yang Xia Li Jiangning Zhang Yunhai Tong Xudong Jiang et al. 2024. Towards open vocabulary learning: A survey. IEEE TPAMI (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"EDA: Explicit Text-Decoupling and Dense Alignment for 3D Visual Grounding. In CVPR. 19231--19242.","author":"Wu Yanmin","year":"2023","unstructured":"Yanmin Wu, Xinhua Cheng, Renrui Zhang, Zesen Cheng, and Jian Zhang. 2023. EDA: Explicit Text-Decoupling and Dense Alignment for 3D Visual Grounding. In CVPR. 19231--19242."},{"key":"e_1_3_2_1_61_1","volume-title":"Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, and Xiaolong Wang.","author":"Xu Jiarui","year":"2022","unstructured":"Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, and Xiaolong Wang. 2022. Groupvit: Semantic segmentation emerges from text supervision. In CVPR. 18134--18144."},{"key":"e_1_3_2_1_62_1","volume-title":"Lavt: Language-aware vision transformer for referring image segmentation. In CVPR. 18155--18165.","author":"Yang Zhao","year":"2022","unstructured":"Zhao Yang, Jiaqi Wang, Yansong Tang, Kai Chen, Hengshuang Zhao, and Philip HS Torr. 2022. Lavt: Language-aware vision transformer for referring image segmentation. In CVPR. 18155--18165."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Zhengyuan Yang Songyang Zhang Liwei Wang and Jiebo Luo. 2021. Sat: 2d semantics assisted training for 3d visual grounding. In ICCV. 1856--1866.","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"e_1_3_2_1_64_1","volume-title":"Modeling context in referring expressions","author":"Yu Licheng","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C Berg, and Tamara L Berg. 2016. Modeling context in referring expressions. In ECCV. Springer, 69--85."},{"key":"e_1_3_2_1_65_1","volume-title":"Instancerefer: Cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In CVPR. 1791--1800.","author":"Yuan Zhihao","year":"2021","unstructured":"Zhihao Yuan, Xu Yan, Yinghong Liao, Ruimao Zhang, Sheng Wang, Zhen Li, and Shuguang Cui. 2021. Instancerefer: Cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In CVPR. 1791--1800."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Yiming Zhang ZeMing Gong and Angel X Chang. 2023. Multi3DRefer: Grounding Text Description to Multiple 3D Objects. In ICCV. 15225--15236.","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Lichen Zhao Daigang Cai Lu Sheng and Dong Xu. 2021. 3DVG-Transformer: Relation modeling for visual grounding on point clouds. In ICCV. 2928--2937.","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Ziyu Zhu Xiaojian Ma Yixin Chen Zhidong Deng Siyuan Huang and Qing Li. 2023. 3d-vista: Pre-trained transformer for 3d vision and text alignment. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00272"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680998","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680998","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:36Z","timestamp":1750295856000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680998"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":68,"alternative-id":["10.1145\/3664647.3680998","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680998","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}