{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:21:32Z","timestamp":1765308092713,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755533","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"6183-6192","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Open3DSearch: Zero-Shot Precise Retrieval of 3D Shapes Using Text Descriptions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5343-3921","authenticated-orcid":false,"given":"Xiong","family":"Li","sequence":"first","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3786-455X","authenticated-orcid":false,"given":"Yikang","family":"Yan","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2914-912X","authenticated-orcid":false,"given":"Zhenyu","family":"Wen","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7123-6155","authenticated-orcid":false,"given":"Qin","family":"Yuan","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2401-6499","authenticated-orcid":false,"given":"Fangda","family":"Guo","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9956-3732","authenticated-orcid":false,"given":"Zhen","family":"Hong","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0247-9866","authenticated-orcid":false,"given":"Ye","family":"Yuan","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Kseniya Cherenkova, Rig Das, Gleb Gusev, Djamila Aouada, and Bjorn Ottersten.","author":"Ahmed Eman","year":"2018","unstructured":"Eman Ahmed, Alexandre Saint, Abd El Rahman Shabayek, Kseniya Cherenkova, Rig Das, Gleb Gusev, Djamila Aouada, and Bjorn Ottersten. 2018. A survey on deep learning advances on different 3D data representations. arXiv preprint arXiv:1808.01462 (2018)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966, Vol. 1, 2 (2023), 3."},{"key":"e_1_3_2_1_5_1","unstructured":"Xiao Bi Deli Chen Guanting Chen Shanhuang Chen Damai Dai Chengqi Deng Honghui Ding Kai Dong Qiushi Du Zhe Fu et al. 2024. Deepseek llm: Scaling open-source language models with longtermism. arXiv preprint arXiv:2401.02954 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012","author":"Chang Angel X","year":"2015","unstructured":"Angel X Chang, Thomas Funkhouser, Leonidas Guibas, Pat Hanrahan, Qixing Huang, Zimo Li, Silvio Savarese, Manolis Savva, Shuran Song, Hao Su, et al., 2015. Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012 (2015)."},{"key":"e_1_3_2_1_7_1","first-page":"100","volume-title":"Perth","author":"Chen Kevin","year":"2019","unstructured":"Kevin Chen, Christopher B Choy, Manolis Savva, Angel X Chang, Thomas Funkhouser, and Silvio Savarese. 2019. Text2shape: Generating shapes from natural language by learning joint embeddings. In Computer Vision-ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, December 2-6, 2018, Revised Selected Papers, Part III 14. Springer, 100-116."},{"key":"e_1_3_2_1_8_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. Instructblip: Towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_2_1_10_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al., 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1086\/427976"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301126"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00217"},{"key":"e_1_3_2_1_15_1","first-page":"20482","article-title":"3d-llm: Injecting the 3d world into large language models","volume":"36","author":"Hong Yining","year":"2023","unstructured":"Yining Hong, Haoyu Zhen, Peihao Chen, Shuhong Zheng, Yilun Du, Zhenfang Chen, and Chuang Gan. 2023. 3d-llm: Injecting the 3d world into large language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20482-20494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"e_1_3_2_1_17_1","volume-title":"Deep learning advances in computer vision with 3d data: A survey. ACM computing surveys (CSUR)","author":"Ioannidou Anastasia","year":"2017","unstructured":"Anastasia Ioannidou, Elisavet Chatzilari, Spiros Nikolopoulos, and Ioannis Kompatsiaris. 2017. Deep learning advances in computer vision with 3d data: A survey. ACM computing surveys (CSUR), Vol. 50, 2 (2017), 1-38."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/582415.582418"},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2023.07.026"},{"key":"e_1_3_2_1_21_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al., 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_24_1","volume-title":"A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253","author":"Liu Hanchao","year":"2024","unstructured":"Hanchao Liu, Wenyuan Xue, Yifei Chen, Dapeng Chen, Xiutian Zhao, Ke Wang, Liping Hou, Rongjun Li, and Wei Peng. 2024c. A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Openshape: Scaling up 3d shape representation towards open-world understanding. Advances in neural information processing systems","author":"Liu Minghua","year":"2024","unstructured":"Minghua Liu, Ruoxi Shi, Kaiming Kuang, Yinhao Zhu, Xuanlin Li, Shizhong Han, Hong Cai, Fatih Porikli, and Hao Su. 2024b. Openshape: Scaling up 3d shape representation towards open-world understanding. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Haoyu Lu Wen Liu Bo Zhang Bingxuan Wang Kai Dong Bo Liu Jingxiang Sun Tongzheng Ren Zhuoshu Li Hao Yang et al. 2024. Deepseek-vl: towards real-world vision-language understanding. arXiv preprint arXiv:2403.05525 (2024)."},{"key":"e_1_3_2_1_27_1","first-page":"75307","article-title":"Scalable 3d captioning with pretrained models","volume":"36","author":"Luo Tiange","year":"2023","unstructured":"Tiange Luo, Chris Rockwell, Honglak Lee, and Justin Johnson. 2023. Scalable 3d captioning with pretrained models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 75307-75337.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 28223-28243","author":"Qi Zekun","year":"2023","unstructured":"Zekun Qi, Runpei Dong, Guofan Fan, Zheng Ge, Xiangyu Zhang, Kaisheng Ma, and Li Yi. 2023. Contrast with reconstruct: Contrastive 3d representation learning guided by generative pretraining. In International Conference on Machine Learning. PMLR, 28223-28243."},{"key":"e_1_3_2_1_29_1","volume-title":"European Conference on Computer Vision. Springer, 214-238","author":"Qi Zekun","year":"2024","unstructured":"Zekun Qi, Runpei Dong, Shaochen Zhang, Haoran Geng, Chunrui Han, Zheng Ge, Li Yi, and Kaisheng Ma. 2024. Shapellm: Universal 3d object understanding for embodied interaction. In European Conference on Computer Vision. Springer, 214-238."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00571"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00665"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681257"},{"key":"e_1_3_2_1_34_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0038202"},{"key":"e_1_3_2_1_37_1","volume-title":"COM3D: Leveraging Cross-View Correspondence and Cross-Modal Mining for 3D Retrieval. arXiv preprint arXiv:2405.04103","author":"Wu Hao","year":"2024","unstructured":"Hao Wu, Ruochong LI, Hao Wang, and Hui Xiong. 2024. COM3D: Leveraging Cross-View Correspondence and Cross-Modal Mining for 3D Retrieval. arXiv preprint arXiv:2405.04103 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"European Conference on Computer Vision. Springer, 131-147","author":"Xu Runsen","year":"2024","unstructured":"Runsen Xu, Xiaolong Wang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. 2024. Pointllm: Empowering large language models to understand point clouds. In European Conference on Computer Vision. Springer, 131-147."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02558"},{"key":"e_1_3_2_1_41_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023b. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023a. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421, Vol. 9, 1 (2023), 1."},{"key":"e_1_3_2_1_44_1","volume-title":"Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al., 2024. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755533","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:19:33Z","timestamp":1765307973000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755533"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755533","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755533","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}