{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T17:27:36Z","timestamp":1783099656344,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Natural Science Foundation of Fujian Province of China","award":["2021J01002, 2022J06001"],"award-info":[{"award-number":["2021J01002, 2022J06001"]}]},{"name":"National Key R\\&D Program of China","award":["2022ZD0118201"],"award-info":[{"award-number":["2022ZD0118201"]}]},{"name":"CCF-NetEase ThunderFire Innovation Research Funding","award":["CCF-Netease 202301"],"award-info":[{"award-number":["CCF-Netease 202301"]}]},{"name":"the National Science Fund for Distinguished Young Scholars","award":["62025603"],"award-info":[{"award-number":["62025603"]}]},{"name":"the National Natural Science Foundation of China","award":["U21B2037, U22B2051, 62072389, U21A20472"],"award-info":[{"award-number":["U21B2037, U22B2051, 62072389, U21A20472"]}]},{"name":"the National Natural Science Fund for Young Scholars of China","award":["62302411"],"award-info":[{"award-number":["62302411"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680841","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"7852-7861","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["3D-GRES: Generalized 3D Referring Expression Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3593-5142","authenticated-orcid":false,"given":"Changli","family":"Wu","sequence":"first","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1235-5016","authenticated-orcid":false,"given":"Yihang","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9956-6308","authenticated-orcid":false,"given":"Jiayi","family":"Ji","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8744-3423","authenticated-orcid":false,"given":"Yiwei","family":"Ma","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0289-9672","authenticated-orcid":false,"given":"Haowei","family":"Wang","sequence":"additional","affiliation":[{"name":"Youtu Lab, Tencent, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5334-1843","authenticated-orcid":false,"given":"Gen","family":"Luo","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4868-6526","authenticated-orcid":false,"given":"Henghui","family":"Ding","sequence":"additional","affiliation":[{"name":"Institute of Big Data, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3912-9306","authenticated-orcid":false,"given":"Xiaoshuai","family":"Sun","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9163-2932","authenticated-orcid":false,"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings, Part I 16","author":"Achlioptas Panos","year":"2020","unstructured":"Panos Achlioptas, Ahmed Abdelreheem, Fei Xia, Mohamed Elhoseiny, and Leonidas Guibas. 2020. Referit3d: Neural listeners for fine-grained 3d object identification in real-world scenes. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I 16. Springer, 422--440."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"e_1_3_2_1_4_1","volume-title":"Language conditioned spatial relation reasoning for 3d object grounding. Advances in neural information processing systems","author":"Chen Shizhe","year":"2022","unstructured":"Shizhe Chen, Pierre-Louis Guhur, Makarand Tapaswi, Cordelia Schmid, and Ivan Laptev. 2022. Language conditioned spatial relation reasoning for 3d object grounding. Advances in neural information processing systems, Vol. 35 (2022), 20522--20535."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Bowen Cheng Lu Sheng Shaoshuai Shi Ming Yang and Dong Xu. 2021. Back-tracing representative points for voting-based 3d object detection in point clouds. In CVPR. 8963--8972.","DOI":"10.1109\/CVPR46437.2021.00885"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_7_1","volume-title":"Instructdet: Diversifying referring object detection with generalized instructions. arXiv preprint arXiv:2310.05136","author":"Dang Ronghao","year":"2023","unstructured":"Ronghao Dang, Jiangyan Feng, Haodong Zhang, Chongjian Ge, Lin Song, Lijun Gong, Chengju Liu, Qijun Chen, Feng Zhu, Rui Zhao, et al. 2023. Instructdet: Diversifying referring object detection with generalized instructions. arXiv preprint arXiv:2310.05136 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00808"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Henghui Ding Chang Liu Suchen Wang and Xudong Jiang. 2021. Vision-language transformer and query generation for referring segmentation. In ICCV. 16321--16330.","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3217852"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.329"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Hao Fei Shengqiong Wu Wei Ji Hanwang Zhang and Tat-Seng Chua. 2024. Dysen-VDM: Empowering Dynamics-aware Text-to-Video Diffusion with LLMs. In CVPR. 7641--7653.","DOI":"10.1109\/CVPR52733.2024.00730"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3393452"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Mingtao Feng Zhen Li Qi Li Liang Zhang XiangDong Zhang Guangming Zhu Hui Zhang Yaonan Wang and Ajmal Mian. 2021. Free-form description guided 3d visual graph network for object grounding in point cloud. In ICCV. 3722--3731.","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00961"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475397"},{"key":"e_1_3_2_1_17_1","volume-title":"RefMask3D: Language-Guided Transformer for 3D Referring Segmentation. arXiv preprint arXiv:2407.18244","author":"He Shuting","year":"2024","unstructured":"Shuting He and Henghui Ding. 2024. RefMask3D: Language-Guided Transformer for 3D Referring Segmentation. arXiv preprint arXiv:2407.18244 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"SegPoint: Segment Any Point Cloud via Large Language Model. arXiv preprint arXiv:2407.13761","author":"He Shuting","year":"2024","unstructured":"Shuting He, Henghui Ding, Xudong Jiang, and Bihan Wen. 2024. SegPoint: Segment Any Point Cloud via Large Language Model. arXiv preprint arXiv:2407.13761 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"GREC: Generalized Referring Expression Comprehension. arXiv preprint arXiv:2308.16182","author":"He Shuting","year":"2023","unstructured":"Shuting He, Henghui Ding, Chang Liu, and Xudong Jiang. 2023. GREC: Generalized Referring Expression Comprehension. arXiv preprint arXiv:2308.16182 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.470"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings, Part I 14","author":"Hu Ronghang","year":"2016","unstructured":"Ronghang Hu, Marcus Rohrbach, and Trevor Darrell. 2016. Segmentation from natural language expressions. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part I 14. Springer, 108--124."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"e_1_3_2_1_24_1","unstructured":"Yutao Hu Qixiong Wang Wenqi Shao Enze Xie Zhenguo Li Jungong Han and Ping Luo. 2023. Beyond one-to-one: Rethinking the referring image segmentation. In ICCV. 4067--4077."},{"key":"e_1_3_2_1_25_1","unstructured":"Zhiwei Hu Guang Feng Jiayu Sun Lihe Zhang and Huchuan Lu. 2020. Bi-directional relationship inferring network for referring image segmentation. In CVPR. 4424--4433."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611902"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475222"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Ya Jing Tao Kong Wei Wang Liang Wang Lei Li and Tieniu Tan. 2021. Locate then segment: A strong pipeline for referring image segmentation. In CVPR. 9858--9867.","DOI":"10.1109\/CVPR46437.2021.00973"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Xin Lai Yuhui Yuan Ruihang Chu Yukang Chen Han Hu and Jiaya Jia. 2023. Mask-attention-free transformer for 3d instance segmentation. In ICCV. 3693--3703.","DOI":"10.1109\/ICCV51070.2023.00342"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00479"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00602"},{"key":"e_1_3_2_1_34_1","unstructured":"Haojia Lin Yongdong Luo Xiawu Zheng Lijiang Li Fei Chao Taisong Jin Donghao Luo Chengjie Wang Yan Wang and Liujuan Cao. 2023. A Unified Framework for 3D Point Cloud Visual Grounding. arxiv: 2308.11887 [cs.CV]"},{"key":"e_1_3_2_1_35_1","volume-title":"Gres: Generalized referring expression segmentation. In CVPR. 23592--23601.","author":"Liu Chang","year":"2023","unstructured":"Chang Liu, Henghui Ding, and Xudong Jiang. 2023. Gres: Generalized referring expression segmentation. In CVPR. 23592--23601."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3277791"},{"key":"e_1_3_2_1_37_1","volume-title":"Instance-specific feature propagation for referring segmentation","author":"Liu Chang","year":"2022","unstructured":"Chang Liu, Xudong Jiang, and Henghui Ding. 2022. Instance-specific feature propagation for referring segmentation. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_38_1","unstructured":"Daqing Liu Hanwang Zhang Feng Wu and Zheng-Jun Zha. 2019. Learning to assemble neural module tree networks for visual grounding. In ICCV. 4673--4682."},{"key":"e_1_3_2_1_39_1","volume-title":"Remoteclip: A vision language foundation model for remote sensing","author":"Liu Fan","year":"2024","unstructured":"Fan Liu, Delong Chen, Zhangqingyun Guan, Xiaocong Zhou, Jiale Zhu, Qiaolin Ye, Liyong Fu, and Jun Zhou. 2024. Remoteclip: A vision language foundation model for remote sensing. IEEE Transactions on Geoscience and Remote Sensing (2024)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612117"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Xihui Liu Zihao Wang Jing Shao Xiaogang Wang and Hongsheng Li. 2019. Improving referring expression grounding with cross-modal attention-guided erasing. In CVPR. 1950--1959.","DOI":"10.1109\/CVPR.2019.00205"},{"key":"e_1_3_2_1_42_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Ze Liu Zheng Zhang Yue Cao Han Hu and Xin Tong. 2021. Group-free 3d object detection via transformers. In ICCV. 2949--2958.","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414006"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Gen Luo Yiyi Zhou Xiaoshuai Sun Liujuan Cao Chenglin Wu Cheng Deng and Rongrong Ji. 2020. Multi-task collaborative network for joint referring expression comprehension and segmentation. In CVPR. 10034--10043.","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Junyu Luo Jiahui Fu Xianghao Kong Chen Gao Haibing Ren Hao Shen Huaxia Xia and Si Liu. 2022. 3d-sps: Single-stage 3d visual grounding via referred point progressive selection. In CVPR. 16454--16463.","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109420"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_49_1","volume-title":"X-mesh: Towards fast and accurate text-driven 3d stylization via dynamic textual guidance. In ICCV. 2749--2760.","author":"Ma Yiwei","year":"2023","unstructured":"Yiwei Ma, Xiaoqing Zhang, Xiaoshuai Sun, Jiayi Ji, Haowei Wang, Guannan Jiang, Weilin Zhuang, and Rongrong Ji. 2023. X-mesh: Towards fast and accurate text-driven 3d stylization via dynamic textual guidance. In ICCV. 2749--2760."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_51_1","volume-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). Ieee, 565--571.","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). Ieee, 565--571."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings, Part IV 14","author":"Nagaraja Varun K","year":"2016","unstructured":"Varun K Nagaraja, Vlad I Morariu, and Larry S Davis. 2016. Modeling context between objects for referring expression understanding. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part IV 14. Springer, 792--807."},{"key":"e_1_3_2_1_54_1","unstructured":"Charles R Qi Or Litany Kaiming He and Leonidas J Guibas. 2019. Deep hough voting for 3d object detection in point clouds. In ICCV. 9277--9286."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28254"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Arka Sadhu Kan Chen and Ram Nevatia. 2019. Zero-shot grounding of objects from natural language queries. In ICCV. 4694--4703.","DOI":"10.1109\/ICCV.2019.00479"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-2812"},{"key":"e_1_3_2_1_58_1","volume-title":"From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network","author":"Shi Shaoshuai","year":"2020","unstructured":"Shaoshuai Shi, Zhe Wang, Jianping Shi, Xiaogang Wang, and Hongsheng Li. 2020. From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 8 (2020), 2647--2664."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25331"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25335"},{"key":"e_1_3_2_1_61_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Peng Wang Qi Wu Jiewei Cao Chunhua Shen Lianli Gao and Anton van den Hengel. 2019. Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks. In CVPR. 1960--1968.","DOI":"10.1109\/CVPR.2019.00206"},{"key":"e_1_3_2_1_63_1","volume-title":"Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation. arXiv preprint arXiv:2312.08007","author":"Wang Wenxuan","year":"2023","unstructured":"Wenxuan Wang, Tongtian Yue, Yisi Zhang, Longteng Guo, Xingjian He, Xinlong Wang, and Jing Liu. 2023. Unveiling Parts Beyond Objects: Towards Finer-Granularity Referring Expression Segmentation. arXiv preprint arXiv:2312.08007 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"3drp-net: 3d relative position-aware network for 3d visual grounding. arXiv preprint arXiv:2307.13363","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Haifeng Huang, Yang Zhao, Linjun Li, Xize Cheng, Yichen Zhu, Aoxiong Yin, and Zhou Zhao. 2023. 3drp-net: 3d relative position-aware network for 3d visual grounding. arXiv preprint arXiv:2307.13363 (2023)."},{"key":"e_1_3_2_1_65_1","volume-title":"3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation. arXiv preprint arXiv:2308.16632","author":"Wu Changli","year":"2023","unstructured":"Changli Wu, Yiwei Ma, Qi Chen, Haowei Wang, Gen Luo, Jiayi Ji, and Xiaoshuai Sun. 2023. 3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation. arXiv preprint arXiv:2308.16632 (2023)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Hao Wu Jiayuan Mao Yufeng Zhang Yuning Jiang Lei Li Weiwei Sun and Wei-Ying Ma. 2019. Unified visual-semantic embeddings: Bridging vision and language with structured meaning representations. In CVPR. 6609--6618.","DOI":"10.1109\/CVPR.2019.00677"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3371348"},{"key":"e_1_3_2_1_68_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024. NExT-GPT: Any-to-Any Multimodal LLM. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_69_1","volume-title":"Eda: Explicit text-decoupling and dense alignment for 3d visual grounding. In CVPR. 19231--19242.","author":"Wu Yanmin","year":"2023","unstructured":"Yanmin Wu, Xinhua Cheng, Renrui Zhang, Zesen Cheng, and Jian Zhang. 2023. Eda: Explicit text-decoupling and dense alignment for 3d visual grounding. In CVPR. 19231--19242."},{"key":"e_1_3_2_1_70_1","volume-title":"GSVA: Generalized Segmentation via Multimodal Large Language Models. arXiv preprint arXiv:2312.10103","author":"Xia Zhuofan","year":"2023","unstructured":"Zhuofan Xia, Dongchen Han, Yizeng Han, Xuran Pan, Shiji Song, and Gao Huang. 2023. GSVA: Generalized Segmentation via Multimodal Large Language Models. arXiv preprint arXiv:2312.10103 (2023)."},{"key":"e_1_3_2_1_71_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Xie Chi","year":"2024","unstructured":"Chi Xie, Zhao Zhang, Yixuan Wu, Feng Zhu, Rui Zhao, and Shuang Liang. 2024. Described Object Detection: Liberating Object Detection with Flexible Expressions. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"crossref","unstructured":"Sibei Yang Meng Xia Guanbin Li Hong-Yu Zhou and Yizhou Yu. 2021. Bottom-up shift and reasoning for referring image segmentation. In CVPR. 11266--11275.","DOI":"10.1109\/CVPR46437.2021.01111"},{"key":"e_1_3_2_1_73_1","volume-title":"Proceedings, Part XIV 16","author":"Yang Zhengyuan","year":"2020","unstructured":"Zhengyuan Yang, Tianlang Chen, Liwei Wang, and Jiebo Luo. 2020. Improving one-stage visual grounding by recursive sub-query construction. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XIV 16. Springer, 387--404."},{"key":"e_1_3_2_1_74_1","volume-title":"Lavt: Language-aware vision transformer for referring image segmentation. In CVPR. 18155--18165.","author":"Yang Zhao","year":"2022","unstructured":"Zhao Yang, Jiaqi Wang, Yansong Tang, Kai Chen, Hengshuang Zhao, and Philip HS Torr. 2022. Lavt: Language-aware vision transformer for referring image segmentation. In CVPR. 18155--18165."},{"key":"e_1_3_2_1_75_1","unstructured":"Linwei Ye Mrigank Rochan Zhi Liu and Yang Wang. 2019. Cross-modal self-attention network for referring image segmentation. In CVPR. 10502--10511."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_77_1","volume-title":"Proceedings, Part II 14","author":"Yu Licheng","year":"2016","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C Berg, and Tamara L Berg. 2016. Modeling context in referring expressions. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part II 14. Springer, 69--85."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.375"},{"key":"e_1_3_2_1_79_1","volume-title":"Instancerefer: Cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In ICCV. 1791--1800.","author":"Yuan Zhihao","year":"2021","unstructured":"Zhihao Yuan, Xu Yan, Yinghong Liao, Ruimao Zhang, Sheng Wang, Zhen Li, and Shuguang Cui. 2021. Instancerefer: Cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In ICCV. 1791--1800."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Yiming Zhang ZeMing Gong and Angel X Chang. 2023. Multi3drefer: Grounding text description to multiple 3d objects. In ICCV. 15225--15236.","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2015.09.118"},{"key":"e_1_3_2_1_82_1","volume-title":"PSALM: Pixelwise SegmentAtion with Large Multi-Modal Model. arXiv preprint arXiv:2403.14598","author":"Zhang Zheng","year":"2024","unstructured":"Zheng Zhang, Yeyao Ma, Enming Zhang, and Xiang Bai. 2024. PSALM: Pixelwise SegmentAtion with Large Multi-Modal Model. arXiv preprint arXiv:2403.14598 (2024)."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"crossref","unstructured":"Lichen Zhao Daigang Cai Lu Sheng and Dong Xu. 2021. 3DVG-Transformer: Relation modeling for visual grounding on point clouds. In ICCV. 2928--2937.","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"e_1_3_2_1_84_1","volume-title":"An open and comprehensive pipeline for unified object grounding and detection. arXiv preprint arXiv:2401.02361","author":"Zhao Xiangyu","year":"2024","unstructured":"Xiangyu Zhao, Yicheng Chen, Shilin Xu, Xiangtai Li, Xinjiang Wang, Yining Li, and Haian Huang. 2024. An open and comprehensive pipeline for unified object grounding and detection. arXiv preprint arXiv:2401.02361 (2024)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00447"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680841","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680841","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:08Z","timestamp":1750295888000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680841"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":84,"alternative-id":["10.1145\/3664647.3680841","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680841","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}