{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:25Z","timestamp":1755825025503,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733405","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:39Z","timestamp":1750876299000},"page":"192-201","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MuSeLLM: SDF Generation and Understanding via Multi-Scale Tokenization with Position-Aware Guidance"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1487-9113","authenticated-orcid":false,"given":"Tianwei","family":"Ding","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6093-8803","authenticated-orcid":false,"given":"Lanshan","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0650-6448","authenticated-orcid":false,"given":"Weijian","family":"Ma","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5538-7367","authenticated-orcid":false,"given":"Xiangdong","family":"Zhou","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023a. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023b. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_4_1","volume-title":"Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012","author":"Chang Angel X","year":"2015","unstructured":"Angel X Chang, Thomas Funkhouser, Leonidas Guibas, Pat Hanrahan, Qixing Huang, Zimo Li, Silvio Savarese, Manolis Savva, Shuran Song, Hao Su, et al. 2015. Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012 (2015)."},{"key":"e_1_3_2_1_5_1","volume-title":"Text2Shape: Generating Shapes from Natural Language by Learning Joint Embeddings. arXiv preprint arXiv:1803.08495","author":"Chen Kevin","year":"2018","unstructured":"Kevin Chen, Christopher B Choy, Manolis Savva, Angel X Chang, Thomas Funkhouser, and Silvio Savarese. 2018. Text2Shape: Generating Shapes from Natural Language by Learning Joint Embeddings. arXiv preprint arXiv:1803.08495 (2018)."},{"key":"e_1_3_2_1_6_1","unstructured":"Sijin Chen Xin Chen Anqi Pang Xianfang Zeng Wei Cheng Yijun Fu Fukun Yin Yanru Wang Zhibin Wang Chi Zhang et al. 2024. MeshXL: Neural Coordinate Field for Generative 3D Foundation Models. arXiv preprint arXiv:2405.20853 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00433"},{"key":"e_1_3_2_1_8_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arxiv: 2305.06500 [cs.CV] https:\/\/arxiv.org\/abs\/2305.06500"},{"key":"e_1_3_2_1_9_1","unstructured":"Yining Hong Haoyu Zhen Peihao Chen Shuhong Zheng Yilun Du Zhenfang Chen and Chuang Gan. 2023. 3D-LLM: Injecting the 3D World into Large Language Models. arxiv: 2307.12981 [cs.CV] https:\/\/arxiv.org\/abs\/2307.12981"},{"key":"e_1_3_2_1_10_1","first-page":"20067","article-title":"Motiongpt: Human motion as a foreign language","volume":"36","author":"Jiang Biao","year":"2023","unstructured":"Biao Jiang, Xin Chen, Wen Liu, Jingyi Yu, Gang Yu, and Tao Chen. 2023. Motiongpt: Human motion as a foreign language. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20067--20079.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_11_1","unstructured":"Heewoo Jun and Alex Nichol. 2023. Shap-E: Generating Conditional 3D Implicit Functions. arxiv: 2305.02463 [cs.CV] https:\/\/arxiv.org\/abs\/2305.02463"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_13_1","volume-title":"3dqd: Generalized deep 3d shape prior via part-discretized diffusion process. arXiv preprint arXiv:2303.10406","author":"Li Yuhan","year":"2023","unstructured":"Yuhan Li, Yishun Dou, Xuanhong Chen, Bingbing Ni, Yilin Sun, Yutian Liu, and Fuzhen Wang. 2023. 3dqd: Generalized deep 3d shape prior via part-discretized diffusion process. arXiv preprint arXiv:2303.10406 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00449"},{"key":"e_1_3_2_1_15_1","volume-title":"Uni3d-llm: Unifying point cloud perception, generation and editing with large language models. arXiv preprint arXiv:2402.03327","author":"Liu Dingning","year":"2024","unstructured":"Dingning Liu, Xiaoshui Huang, Yuenan Hou, Zhihui Wang, Zhenfei Yin, Yongshun Gong, Peng Gao, and Wanli Ouyang. 2024a. Uni3d-llm: Unifying point cloud perception, generation and editing with large language models. arXiv preprint arXiv:2402.03327 (2024)."},{"key":"e_1_3_2_1_16_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023a. Visual Instruction Tuning."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM62325.2024.10822061"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00040"},{"key":"e_1_3_2_1_21_1","volume-title":"Point-e: A system for generating 3d point clouds from complex prompts. arXiv preprint arXiv:2212.08751","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Heewoo Jun, Prafulla Dhariwal, Pamela Mishkin, and Mark Chen. 2022a. Point-e: A system for generating 3d point clouds from complex prompts. arXiv preprint arXiv:2212.08751 (2022)."},{"key":"e_1_3_2_1_22_1","unstructured":"Alex Nichol Heewoo Jun Prafulla Dhariwal Pamela Mishkin and Mark Chen. 2022b. Point-E: A System for Generating 3D Point Clouds from Complex Prompts. arxiv: 2212.08751 [cs.CV] https:\/\/arxiv.org\/abs\/2212.08751"},{"key":"e_1_3_2_1_23_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Qi Zekun","year":"2024","unstructured":"Zekun Qi, Muzhou Yu, Runpei Dong, and Kaisheng Ma. 2024. Vpp: Efficient conditional 3d generation via voxel-point progressive representation. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_24_1","unstructured":"Guocheng Qian Jinjie Mai Abdullah Hamdi Jian Ren Aliaksandr Siarohin Bing Li Hsin-Ying Lee Ivan Skorokhodov Peter Wonka Sergey Tulyakov et al. 2023. Magic123: One image to high-quality 3d object generation using both 2d and 3d diffusion priors. arXiv preprint arXiv:2306.17843 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00403"},{"key":"e_1_3_2_1_26_1","volume-title":"Zero123: a single image to consistent multi-view diffusion base model. arXiv preprint arXiv:2310.15110","author":"Shi Ruoxi","year":"2023","unstructured":"Ruoxi Shi, Hansheng Chen, Zhuoyang Zhang, Minghua Liu, Chao Xu, Xinyue Wei, Linghao Chen, Chong Zeng, and Hao Su. 2023. Zero123: a single image to consistent multi-view diffusion base model. arXiv preprint arXiv:2310.15110 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01855"},{"key":"e_1_3_2_1_28_1","volume-title":"3d-gpt: Procedural 3d modeling with large language models. arXiv preprint arXiv:2310.12945","author":"Sun Chunyi","year":"2023","unstructured":"Chunyi Sun, Junlin Han, Weijian Deng, Xinlong Wang, Zishan Qin, and Stephen Gould. 2023. 3d-gpt: Procedural 3d modeling with large language models. arXiv preprint arXiv:2310.12945 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation. arXiv preprint arXiv:2406.06525","author":"Sun Peize","year":"2024","unstructured":"Peize Sun, Yi Jiang, Shoufa Chen, Shilong Zhang, Bingyue Peng, Ping Luo, and Zehuan Yuan. 2024. Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation. arXiv preprint arXiv:2406.06525 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Visual autoregressive modeling: Scalable image generation via next-scale prediction. arXiv preprint arXiv:2404.02905","author":"Tian Keyu","year":"2024","unstructured":"Keyu Tian, Yi Jiang, Zehuan Yuan, Bingyue Peng, and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. arXiv preprint arXiv:2404.02905 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models. arXiv preprint arXiv:2411.09595","author":"Wang Zhengyi","year":"2024","unstructured":"Zhengyi Wang, Jonathan Lorraine, Yikai Wang, Hang Su, Jun Zhu, Sanja Fidler, and Xiaohui Zeng. 2024a. LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models. arXiv preprint arXiv:2411.09595 (2024)."},{"key":"e_1_3_2_1_32_1","unstructured":"Zhengyi Wang Jonathan Lorraine Yikai Wang Hang Su Jun Zhu Sanja Fidler and Xiaohui Zeng. 2024b. LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models. arxiv: 2411.09595 [cs.LG] https:\/\/arxiv.org\/abs\/2411.09595"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 1912--1920","author":"Wu Zhirong","year":"2015","unstructured":"Zhirong Wu, Shuran Song, Aditya Khosla, Fisher Yu, Linguang Zhang, Xiaoou Tang, and Jianxiong Xiao. 2015. 3d shapenets: A deep representation for volumetric shapes. In Proceedings of the IEEE conference on computer vision and pattern recognition. 1912--1920."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00649"},{"key":"e_1_3_2_1_36_1","unstructured":"Fukun Yin Xin Chen Chi Zhang Biao Jiang Zibo Zhao Jiayuan Fan Gang Yu Taihao Li and Tao Chen. 2023. ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model. arxiv: 2311.17618 [cs.CV] https:\/\/arxiv.org\/abs\/2311.17618"},{"key":"e_1_3_2_1_37_1","volume-title":"Anygpt: Unified multimodal llm with discrete sequence modeling. arXiv preprint arXiv:2402.12226","author":"Zhan Jun","year":"2024","unstructured":"Jun Zhan, Junqi Dai, Jiasheng Ye, Yunhua Zhou, Dong Zhang, Zhigeng Liu, Xin Zhang, Ruibin Yuan, Ge Zhang, Linyang Li, et al. 2024. Anygpt: Unified multimodal llm with discrete sequence modeling. arXiv preprint arXiv:2402.12226 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"GaussianCube: Structuring Gaussian Splatting using Optimal Transport for 3D Generative Modeling. arXiv preprint arXiv:2403.19655","author":"Zhang Bowen","year":"2024","unstructured":"Bowen Zhang, Yiji Cheng, Jiaolong Yang, Chunyu Wang, Feng Zhao, Yansong Tang, Dong Chen, and Baining Guo. 2024. GaussianCube: Structuring Gaussian Splatting using Optimal Transport for 3D Generative Modeling. arXiv preprint arXiv:2403.19655 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02164"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733405","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:10:58Z","timestamp":1755749458000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733405"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":40,"alternative-id":["10.1145\/3731715.3733405","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733405","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}