{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T17:28:37Z","timestamp":1767893317512,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176224,62176092,62222602,62306165"],"award-info":[{"award-number":["62176224,62176092,62222602,62306165"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Science and Technology on Sonar Laboratory","award":["2024-JCJQ-LB-32\/07"],"award-info":[{"award-number":["2024-JCJQ-LB-32\/07"]}]},{"DOI":"10.13039\/501100015282","name":"China Academy of Railway Sciences","doi-asserted-by":"publisher","award":["2023Y1357"],"award-info":[{"award-number":["2023Y1357"]}],"id":[{"id":"10.13039\/501100015282","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754985","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"3094-3103","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SeqVLM: Proposal-Guided Multi-View Sequences Reasoning via VLM for Zero-Shot 3D Visual Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6591-8487","authenticated-orcid":false,"given":"Jiawen","family":"Lin","sequence":"first","affiliation":[{"name":"School of Informatics, Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3000-7679","authenticated-orcid":false,"given":"Shiran","family":"Bian","sequence":"additional","affiliation":[{"name":"School of Informatics, Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1370-3510","authenticated-orcid":false,"given":"Yihang","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Nanjing University, Nanjing, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9619-6323","authenticated-orcid":false,"given":"Wenbin","family":"Tan","sequence":"additional","affiliation":[{"name":"School of Informatics, Xiamen University, XiaMen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6153-5004","authenticated-orcid":false,"given":"Yachao","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Informatics, Xiamen University, Xiamen, Fujian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6945-7437","authenticated-orcid":false,"given":"Yuan","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8926-4162","authenticated-orcid":false,"given":"Yanyun","family":"Qu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, Fujian, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/bailian.console.aliyun.com\/","author":"Cloud 
Alibaba","year":"2025","unstructured":"Alibaba Cloud. Tongyi Qianwen. https:\/\/bailian.console.aliyun.com\/, 2025. Accessed: 2025-03-03."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/console.volcengine.com\/","year":"2025","unstructured":"ByteDance. VolcEngine. https:\/\/console.volcengine.com\/, 2025. Accessed: 2025-02-18."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS58744.2024.10558436"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_29"},{"key":"e_1_3_2_1_8_1","volume-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478, 2023."},{"key":"e_1_3_2_1_9_1","volume-title":"Reasoning3d-grounding and reasoning in 3d: Fine-grained zero-shot open-vocabulary 3d reasoning part segmentation via large vision-language models. arXiv preprint arXiv:2405.19326","author":"Chen Tianrun","year":"2024","unstructured":"Tianrun Chen, Chunan Yu, Jing Li, Jianqi Zhang, Lanyun Zhu, Deyi Ji, Yong Zhang, Ying Zang, Zejian Li, and Lingyun Sun. Reasoning3d-grounding and reasoning in 3d: Fine-grained zero-shot open-vocabulary 3d reasoning part segmentation via large vision-language models. arXiv preprint arXiv:2405.19326, 2024."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_2_1_11_1","first-page":"93","volume-title":"European Conference on Computer Vision","author":"Hu Jiarui","year":"2024","unstructured":"Jiarui Hu, Xianhao Chen, Boyin Feng, Guanglin Li, Liangjing Yang, Hujun Bao, Guofeng Zhang, and Zhaopeng Cui. Cg-slam: Efficient dense rgb-d slam in a consistent uncertainty-aware 3d gaussian field. In European Conference on Computer Vision, pages 93-112. Springer, 2024."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02039"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV66043.2025.00112"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2303"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160433"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01088"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10846-023-02046-3"},{"key":"e_1_3_2_1_23_1","first-page":"19730","volume-title":"International Conference on Machine Learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 
Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning, pages 19730-19742. PMLR, 2023."},{"key":"e_1_3_2_1_24_1","volume-title":"Seeground: See and ground for zero-shot open-vocabulary 3d visual grounding. arXiv preprint arXiv:2412.04383","author":"Li Rong","year":"2024","unstructured":"Rong Li, Shijie Li, Lingdong Kong, Xulei Yang, and Junwei Liang. Seeground: See and ground for zero-shot open-vocabulary 3d visual grounding. arXiv preprint arXiv:2412.04383, 2024."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2024.3463801"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679793"},{"key":"e_1_3_2_1_27_1","volume-title":"Aligning cyber space with physical world: A comprehensive survey on embodied ai. arXiv preprint arXiv:2407.06886","author":"Liu Yang","year":"2024","unstructured":"Yang Liu, Weixing Chen, Yongjie Bai, Xiaodan Liang, Guanbin Li, Wen Gao, and Liang Lin. Aligning cyber space with physical world: A comprehensive survey on embodied ai. arXiv preprint arXiv:2407.06886, 2024."},{"key":"e_1_3_2_1_28_1","first-page":"2949","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Zheng Zhang, Yue Cao, Han Hu, and Xin Tong. Group-free 3d object detection via transformers. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 2949-2958, 2021."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2025.3542333"},{"key":"e_1_3_2_1_31_1","volume-title":"https:\/\/chatgpt.com\/","author":"AI.","year":"2024","unstructured":"OpenAI. ChatGPT. https:\/\/chatgpt.com\/, 2024. Version: 2024-05-13."},{"key":"e_1_3_2_1_32_1","first-page":"815","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Peng Songyou","year":"2023","unstructured":"Songyou Peng, Kyle Genova, Chiyu Jiang, Andrea Tagliasacchi, Marc Pollefeys, Thomas Funkhouser, et al. Openscene: 3d scene understanding with open vocabularies. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 815-824, 2023."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00689"},{"key":"e_1_3_2_1_34_1","unstructured":"Zhipeng Qian Yiwei Ma Zhekai Lin Jiayi Ji and Xiawu Zheng. Multi-branch collaborative learning network for 3d visual grounding."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02089-5"},{"key":"e_1_3_2_1_36_1","first-page":"8748","volume-title":"International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning, pages 8748-8763. PmLR, 2021."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160590"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.157"},{"key":"e_1_3_2_1_39_1","volume-title":"Flowcam: Training generalizable 3d radiance fields without camera poses via pixel-aligned scene flow. 
arXiv preprint arXiv:2306.00180","author":"Smith Cameron","year":"2023","unstructured":"Cameron Smith, Yilun Du, Ayush Tewari, and Vincent Sitzmann. Flowcam: Training generalizable 3d radiance fields without camera poses via pixel-aligned scene flow. arXiv preprint arXiv:2306.00180, 2023."},{"key":"e_1_3_2_1_40_1","first-page":"196","volume-title":"European Conference on Computer Vision","author":"Unal Ozan","year":"2024","unstructured":"Ozan Unal, Christos Sakaridis, Suman Saha, and Luc Van Gool. Four ways to improve verbo-visual fusion for dense 3d visual grounding. In European Conference on Computer Vision, pages 196-213. Springer, 2024."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40231-1"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01320"},{"key":"e_1_3_2_1_44_1","volume-title":"Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3d scenes. arXiv preprint arXiv:2308.08769","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Haifeng Huang, Yang Zhao, Ziang Zhang, and Zhou Zhao. Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3d scenes. arXiv preprint arXiv:2308.08769, 2023."},{"key":"e_1_3_2_1_45_1","first-page":"69925","article-title":"Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks","volume":"37","author":"Wu Jiannan","year":"2024","unstructured":"Jiannan Wu, Muyan Zhong, Sen Xing, Zeqiang Lai, Zhaoyang Liu, Zhe Chen, Wenhai Wang, Xizhou Zhu, Lewei Lu, Tong Lu, et al. Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks. Advances in Neural Information Processing Systems, 37:69925-69975, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01843"},{"key":"e_1_3_2_1_47_1","volume-title":"M-divo: Multiple tof rgb-d cameras enhanced depth-inertial-visual odometry","author":"Xu Jie","year":"2024","unstructured":"Jie Xu, Wenlu Yu, Song Huang, Shenghai Yuan, Lijun Zhao, Ruifeng Li, and Lihua Xie. M-divo: Multiple tof rgb-d cameras enhanced depth-inertial-visual odometry. IEEE Internet of Things Journal, 2024."},{"key":"e_1_3_2_1_48_1","volume-title":"Vlm-grounder: A vlm agent for zero-shot 3d visual grounding. arXiv preprint arXiv:2410.13860","author":"Xu Runsen","year":"2024","unstructured":"Runsen Xu, Zhiwei Huang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. Vlm-grounder: A vlm agent for zero-shot 3d visual grounding. arXiv preprint arXiv:2410.13860, 2024."},{"key":"e_1_3_2_1_49_1","first-page":"20923","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Yan Zhiwen","year":"2024","unstructured":"Zhiwen Yan, Weng Fei Low, Yu Chen, and Gim Hee Lee. Multi-scale 3d gaussian splatting for anti-aliased rendering. 
In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 20923-20931, 2024."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610443"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01949"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28559"},{"key":"e_1_3_2_1_55_1","article-title":"A universal multi-modal large language model for multi-sensor image comprehension in remote sensing domain","author":"Zhang Wei","year":"2024","unstructured":"Wei Zhang, Miaoxin Cai, Tong Zhang, Yin Zhuang, and Xuerui Mao. Earthgpt: A universal multi-modal large language model for multi-sensor image comprehension in remote sensing domain. IEEE Transactions on Geoscience and Remote Sensing, 2024.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_56_1","article-title":"Prototype correlation matching and class-relation reasoning for few-shot medical image segmentation","author":"Zhang Yumin","year":"2024","unstructured":"Yumin Zhang, Hongliu Li, Yajun Gao, Haoran Duan, Yawen Huang, and Yefeng Zheng. Prototype correlation matching and class-relation reasoning for few-shot medical image segmentation. IEEE Transactions on Medical Imaging, 2024.","journal-title":"IEEE Transactions on Medical Imaging"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01241"},{"key":"e_1_3_2_1_58_1","first-page":"2928","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Zhao Lichen","year":"2021","unstructured":"Lichen Zhao, Daigang Cai, Lu Sheng, and Dong Xu. 3dvg-transformer: Relation modeling for visual grounding on point clouds. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 2928-2937, 2021."},{"key":"e_1_3_2_1_59_1","volume-title":"A stitch in time saves nine: Small vlm is a precise guidance for accelerating large vlms. arXiv preprint arXiv:2412.03324","author":"Zhao Wangbo","year":"2024","unstructured":"Wangbo Zhao, Yizeng Han, Jiasheng Tang, Zhikai Li, Yibing Song, Kai Wang, Zhangyang Wang, and Yang You. A stitch in time saves nine: Small vlm is a precise guidance for accelerating large vlms. 
arXiv preprint arXiv:2412.03324, 2024."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01752"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3154022"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754985","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:23Z","timestamp":1765339523000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754985"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":61,"alternative-id":["10.1145\/3746027.3754985","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754985","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
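The record above is a standard Crossref REST API "work" response (status/message-type envelope around a "message" object holding title, author, DOI, published date, and the reference list). As a minimal sketch of how such a record could be retrieved and summarized, the following Python snippet queries the public Crossref endpoint for the DOI shown above and prints a one-line citation; the requests package, the placeholder contact address in the User-Agent header, and the exact output format are assumptions, not part of the record itself.

import requests

DOI = "10.1145/3746027.3754985"  # DOI of the work record shown above

def fetch_work(doi: str) -> dict:
    # Crossref REST API: GET https://api.crossref.org/works/{doi}
    resp = requests.get(
        f"https://api.crossref.org/works/{doi}",
        # Placeholder contact for Crossref's "polite pool" convention
        headers={"User-Agent": "example-script/0.1 (mailto:you@example.org)"},
        timeout=30,
    )
    resp.raise_for_status()
    payload = resp.json()
    # A single-work lookup returns message-type "work", as in the record above
    assert payload.get("message-type") == "work"
    return payload["message"]

def summarize(work: dict) -> str:
    # Fields mirror the structure of the record above: title, author, published, container-title, DOI
    title = work["title"][0] if work.get("title") else "(untitled)"
    authors = ", ".join(
        f'{a.get("given", "")} {a.get("family", "")}'.strip()
        for a in work.get("author", [])
    )
    year = work.get("published", {}).get("date-parts", [[None]])[0][0]
    venue = (work.get("container-title") or [""])[0]
    return f"{authors} ({year}). {title}. {venue}. https://doi.org/{work['DOI']}"

if __name__ == "__main__":
    print(summarize(fetch_work(DOI)))

Run against this DOI, the sketch would print the author list, the 2025 publication year, the SeqVLM title, and the MM '25 proceedings venue drawn from the fields visible in the record; error handling and rate limiting are intentionally omitted.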