{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:55Z","timestamp":1781538895317,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376182"],"award-info":[{"award-number":["62376182"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810588","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1231-1240","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["VAR-3D: View-aware Auto-Regressive Model for Text-to-3D Generation via a 3D Tokenizer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3477-9778","authenticated-orcid":false,"given":"Zongcheng","family":"Han","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0606-3718","authenticated-orcid":false,"given":"Yu","family":"Hong","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8072-0308","authenticated-orcid":false,"given":"Haoran","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6857-5333","authenticated-orcid":false,"given":"Dongyan","family":"Cao","sequence":"additional","affiliation":[{"name":"Suzhou Research Institute, Harbin Institute of Technology, Suzhou, China and Research Center for Social Computing and Interactive Robotics, Harbin Institute of Technology, Harbin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et\u00a0al. 2022. Flamingo: a visual language model for few-shot learning. 35 (2022) 23716\u201323736.","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Tenglong Ao Qingzhe Gao Yuke Lou Baoquan Chen and Libin Liu. 2022. Rhythmic gesticulator: Rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Transactions on Graphics (TOG) 41 6 (2022) 1\u201319.","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_3_1_5_2","unstructured":"Yoshua Bengio Nicholas L\u00e9onard and Aaron Courville. 2013. Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1308.3432 (2013)."},{"key":"e_1_3_3_1_6_2","unstructured":"Miko\u0142aj Bi\u0144kowski Danica\u00a0J Sutherland Michael Arbel and Arthur Gretton. 2018. Demystifying MMD GANs."},{"key":"e_1_3_3_1_7_2","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman et\u00a0al. 2024. Video generation models as world simulators. OpenAI Blog 1 (2024) 8."},{"key":"e_1_3_3_1_8_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_1_9_2","unstructured":"Huiwen Chang Han Zhang Jarred Barber AJ Maschinot Jose Lezama Lu Jiang Ming-Hsuan Yang Kevin Murphy William\u00a0T. Freeman Michael Rubinstein Yuanzhen Li and Dilip Krishnan. 2023. Muse: Text-To-Image Generation via Masked Generative Transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.00704 (2023)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"e_1_3_3_1_11_2","unstructured":"Jinnan Chen Chen Li Jianfeng Zhang Lingting Zhu Buzhen Huang Hanlin Chen and Gim\u00a0Hee Lee. 2024. Generalizable Human Gaussians from Single-View Image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06050 (2024)."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Rui Chen Yongwei Chen Ningxin Jiao and Kui Jia. 2023. Fantasia3d: Disentangling geometry and appearance for high-quality text-to-3d content creation. 22246\u201322256.","DOI":"10.1109\/ICCV51070.2023.02033"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Sijin Chen Xin Chen Anqi Pang Xianfang Zeng Wei Cheng Yijun Fu Fukun Yin Yanru Wang Zhibin Wang Chi Zhang et\u00a0al. 2024. MeshXL: Neural Coordinate Field for Generative 3D Foundation Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.20853 (2024).","DOI":"10.52202\/079017-3080"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Yiwen Chen Tong He Di Huang Weicai Ye Sijin Chen Jiaxiang Tang Xin Chen Zhongang Cai Lei Yang Gang Yu Guosheng Lin and Chi Zhang. 2025. MeshAnything: Artist-Created Mesh Generation with Autoregressive Transformers.","DOI":"10.1109\/ICCV51701.2025.01292"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Yongwei Chen Yushi Lan Shangchen Zhou Tengfei Wang and Xingang Pan. 2025. SAR3D: Autoregressive 3D Object Generation and Understanding via Multi-scale 3D VQVAE. (2025).","DOI":"10.1109\/CVPR52734.2025.02642"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Yongwei Chen Tengfei Wang Tong Wu Xingang Pan Kui Jia and Ziwei Liu. 2024. Comboverse: Compositional 3d assets creation using spatially-aware diffusion guidance. Springer 128\u2013146.","DOI":"10.1007\/978-3-031-72691-0_8"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02475"},{"key":"e_1_3_3_1_18_2","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung\u00a0Won Chung Charles Sutton Sebastian Gehrmann et\u00a0al. 2023. Palm: Scaling language modeling with pathways. 24 240 (2023) 1\u2013113."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Matt Deitke Ruoshi Liu Matthew Wallingford Huong Ngo Oscar Michel Aditya Kusupati Alan Fan Christian Laforte Vikram Voleti Samir\u00a0Yitzhak Gadre et\u00a0al. 2023. Objaverse-xl: A universe of 10m+ 3d objects. 36 (2023) 35799\u201335813.","DOI":"10.52202\/075280-1554"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Matt Deitke Dustin Schwenk Jordi Salvador Luca Weihs Oscar Michel Eli VanderBilt Ludwig Schmidt Kiana Ehsani Aniruddha Kembhavi and Ali Farhadi. 2023. Objaverse: A universe of annotated 3d objects. 13142\u201313153.","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_3_1_21_2","unstructured":"Danny Driess Fei Xia Mehdi\u00a0SM Sajjadi Corey Lynch Aakanksha Chowdhery Brian Ichter Ayzaan Wahid Jonathan Tompson Quan Vuong Tianhe Yu et\u00a0al. 2023. PaLM-E: An Embodied Multimodal Language Model. (2023) 8469\u20138488."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_3_1_23_2","unstructured":"Ishaan Gulrajani Faruk Ahmed Martin Arjovsky Vincent Dumoulin and Aaron Courville. 2017. Improved Training of Wasserstein GANs. (2017). arxiv:https:\/\/arXiv.org\/abs\/1704.00028"},{"key":"e_1_3_3_1_24_2","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. 30 (2017)."},{"key":"e_1_3_3_1_25_2","unstructured":"Fangzhou Hong Jiaxiang Tang Ziang Cao Min Shi Tong Wu Zhaoxi Chen Tengfei Wang Liang Pan Dahua Lin and Ziwei Liu. 2024. 3DTopia: Large Text-to-3D Generation Model with Hybrid Diffusion Priors. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.02234 (2024)."},{"key":"e_1_3_3_1_26_2","unstructured":"Yicong Hong Kai Zhang Jiuxiang Gu Sai Bi Yang Zhou Difan Liu Feng Liu Kalyan Sunkavalli Trung Bui and Hao Tan. 2024. LRM: Large Reconstruction Model for Single Image to 3D. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.04400 (2024)."},{"key":"e_1_3_3_1_27_2","unstructured":"Heewoo Jun and Alex Nichol. 2023. Shap-E: Generating Conditional 3D Implicit Functions. (2023). arxiv:https:\/\/arXiv.org\/abs\/2305.02463"},{"key":"e_1_3_3_1_28_2","unstructured":"Diederik\u00a0P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2013)."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Yushi Lan Fangzhou Hong Shuai Yang Shangchen Zhou Xuyi Meng Bo Dai Xingang Pan and Chen\u00a0Change Loy. 2024. LN3Diff: Scalable Latent Neural Fields Diffusion for Speedy 3D Generation. (2024).","DOI":"10.1007\/978-3-031-73235-5_7"},{"key":"e_1_3_3_1_30_2","unstructured":"Yushi Lan Shangchen Zhou Zhaoyang Lyu Fangzhou Hong Shuai Yang Bo Dai Xingang Pan and Chen\u00a0Change Loy. 2025. GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D Generation."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01123"},{"key":"e_1_3_3_1_32_2","unstructured":"Tianhong Li Yonglong Tian He Li Mingyang Deng and Kaiming He. 2024. Autoregressive Image Generation without Vector Quantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.11838 (2024)."},{"key":"e_1_3_3_1_33_2","unstructured":"Weiyu Li Jiarui Liu Hongyu Yan Rui Chen Yixun Liang Xuelin Chen Ping Tan and Xiaoxiao Long. 2025. CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner."},{"key":"e_1_3_3_1_34_2","unstructured":"Yuan Liu Cheng Lin Zijiao Zeng Xiaoxiao Long Lingjie Liu Taku Komura and Wenping Wang. 2023. Syncdreamer: Generating multiview-consistent images from a single-view image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.03453 (2023)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72652-1_5"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Xiaoxiao Long Yuan-Chen Guo Cheng Lin Yuan Liu Zhiyang Dou Lingjie Liu Yuexin Ma Song-Hai Zhang Marc Habermann Christian Theobalt et\u00a0al. 2024. Wonder3d: Single image to 3d using cross-domain diffusion. 9970\u20139980.","DOI":"10.1109\/CVPR52733.2024.00951"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et\u00a0al. 2022. Training language models to follow instructions with human feedback. 35 (2022) 27730\u201327744.","DOI":"10.52202\/068431-2011"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"William Peebles and Saining Xie. 2023. Scalable diffusion models with transformers. 4195\u20134205.","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_1_39_2","unstructured":"Ben Poole Ajay Jain Jonathan\u00a0T Barron and Ben Mildenhall. 2023. DreamFusion: Text-to-3D using 2D Diffusion."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Lingteng Qiu Guanying Chen Xiaodong Gu Qi Zuo Mutian Xu Yushuang Wu Weihao Yuan Zilong Dong Liefeng Bo and Xiaoguang Han. 2024. Richdreamer: A generalizable normal-depth diffusion model for detail richness in text-to-3d. 9914\u20139925.","DOI":"10.1109\/CVPR52733.2024.00946"},{"key":"e_1_3_3_1_41_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark et\u00a0al. 2021. Learning transferable visual models from natural language supervision. PmLR 8748\u20138763."},{"key":"e_1_3_3_1_42_2","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR."},{"key":"e_1_3_3_1_43_2","first-page":"8821","volume-title":"International conference on machine learning","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821\u20138831."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_45_2","unstructured":"Tim Salimans Andrej Karpathy Xi Chen and Diederik\u00a0P Kingma. 2017. PixelCNN++: Improving the PixelCNN with Discretized Logistic Mixture Likelihood and Other Modifications."},{"key":"e_1_3_3_1_46_2","volume-title":"arXiv","author":"Shi Ruoxi","year":"2023","unstructured":"Ruoxi Shi, Hansheng Chen, Zhuoyang Zhang, Minghua Liu, Chao Xu, Xinyue Wei, Linghao Chen, Chong Zeng, and Hao Su. 2023. Zero123++: a Single Image to Consistent Multi-view Diffusion Base Model. In arXiv."},{"key":"e_1_3_3_1_47_2","unstructured":"Yichun Shi Peng Wang Jianglong Ye Long Mai Kejie Li and Xiao Yang. 2023. MVDream: Multi-view Diffusion for 3D Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.16512 (2023)."},{"key":"e_1_3_3_1_48_2","volume-title":"CVPR","author":"Siddiqui Yawar","year":"2023","unstructured":"Yawar Siddiqui, Antonio Alliegro, Alexey Artemov, Tatiana Tommasi, Daniele Sirigatti, Vladislav Rosov, Angela Dai, and Matthias Nie\u00dfner. 2023. MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers. In CVPR."},{"key":"e_1_3_3_1_49_2","unstructured":"Vincent Sitzmann Semon Rezchikov Bill Freeman Josh Tenenbaum and Fredo Durand. 2021. Light field networks: Neural scene representations with single-evaluation rendering. 34 (2021) 19313\u201319325."},{"key":"e_1_3_3_1_50_2","unstructured":"Peize Sun Yi Jiang Shoufa Chen Shilong Zhang Bingyue Peng Ping Luo and Zehuan Yuan. 2024. Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06525 (2024)."},{"key":"e_1_3_3_1_51_2","unstructured":"Peize Sun Yi Jiang Shoufa Chen Shilong Zhang Bingyue Peng Ping Luo and Zehuan Yuan. 2024. Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06525 (2024)."},{"key":"e_1_3_3_1_52_2","unstructured":"Quan Sun Qiying Yu Yufeng Cui Fan Zhang Xiaosong Zhang Yueze Wang Hongcheng Gao Jingjing Liu Tiejun Huang and Xinlong Wang. 2024. Emu: Generative Pretraining in Multimodality. (2024)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Jiaxiang Tang Zhaoxi Chen Xiaokang Chen Tengfei Wang Gang Zeng and Ziwei Liu. 2024. LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content Creation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.05054 (2024).","DOI":"10.1007\/978-3-031-73235-5_1"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"crossref","unstructured":"Junshu Tang Tengfei Wang Bo Zhang Ting Zhang Ran Yi Lizhuang Ma and Dong Chen. 2023. Make-it-3d: High-fidelity 3d creation from a single image with diffusion prior. (2023) 22819\u201322829.","DOI":"10.1109\/ICCV51070.2023.02086"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"crossref","unstructured":"Keyu Tian Yi Jiang Zehuan Yuan Bingyue Peng and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. 37 (2024) 84839\u201384865.","DOI":"10.52202\/079017-2694"},{"key":"e_1_3_3_1_56_2","unstructured":"Dmitry Tochilkin David Pankratz Zexiang Liu Zixuan Huang Adam Letts Yangguang Li Ding Liang Christian Laforte Varun Jampani and Yan-Pei Cao. 2024. TripoSR: Fast 3D Object Reconstruction from a Single Image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.02151 (2024)."},{"key":"e_1_3_3_1_57_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"crossref","unstructured":"Arash Vahdat Francis Williams Zan Gojcic Or Litany Sanja Fidler Karsten Kreis et\u00a0al. 2022. Lion: Latent point diffusion models for 3d shape generation. 35 (2022) 10021\u201310039.","DOI":"10.52202\/068431-0728"},{"key":"e_1_3_3_1_59_2","unstructured":"Aaron Van Den\u00a0Oord Oriol Vinyals et\u00a0al. 2017. Neural discrete representation learning. 30 (2017)."},{"key":"e_1_3_3_1_60_2","unstructured":"Peng Wang and Yichun Shi. 2023. ImageDream: Image-Prompt Multi-view Diffusion for 3D Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.02201 (2023)."},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"crossref","unstructured":"Tengfei Wang Bo Zhang Ting Zhang Shuyang Gu Jianmin Bao Tadas Baltrusaitis Jingjing Shen Dong Chen Fang Wen Qifeng Chen et\u00a0al. 2023. Rodin: A generative model for sculpting 3d digital avatars using diffusion. 4563\u20134573.","DOI":"10.1109\/CVPR52729.2023.00443"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C Bovik Hamid\u00a0R Sheikh and Eero\u00a0P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_1_63_2","unstructured":"Zhengyi Wang Jonathan Lorraine Yikai Wang Hang Su Jun Zhu Sanja Fidler and Xiaohui Zeng. 2024. LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.09595 (2024)."},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"crossref","unstructured":"Zhengyi Wang Cheng Lu Yikai Wang Fan Bao Chongxuan Li Hang Su and Jun Zhu. 2023. Prolificdreamer: High-fidelity and diverse text-to-3d generation with variational score distillation. 36 (2023) 8406\u20138441.","DOI":"10.52202\/075280-0368"},{"key":"e_1_3_3_1_65_2","unstructured":"Zhenwei Wang Tengfei Wang Zexin He Gerhard\u00a0Petrus Hancke Ziwei Liu and Rynson\u00a0WH Lau. 2025. Phidias: A Generative Model for Creating 3D Content from Text Image and 3D Conditions with Reference-Augmented Diffusion."},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3721238.3730601"},{"key":"e_1_3_3_1_67_2","unstructured":"Shuang Wu Youtian Lin Feihu Zhang Yifei Zeng Jingxi Xu Philip Torr Xun Cao and Yao Yao. 2024. Direct3D: Scalable Image-to-3D Generation via 3D Latent Diffusion Transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.14832 (2024)."},{"key":"e_1_3_3_1_68_2","unstructured":"Jiale Xu Weihao Cheng Yiming Gao Xintao Wang Shenghua Gao and Ying Shan. 2024. Instantmesh: Efficient 3d mesh generation from a single image with sparse-view large reconstruction models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.07191 (2024)."},{"key":"e_1_3_3_1_69_2","unstructured":"Lijun Yu Jose Lezama Nitesh\u00a0Bharadwaj Gundavarapu Luca Versari Kihyuk Sohn David Minnen Yong Cheng Agrim Gupta Xiuye Gu Alexander\u00a0G Hauptmann et\u00a0al. 2024. Language Model Beats Diffusion-Tokenizer is key to visual generation. (2024)."},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"crossref","unstructured":"Biao Zhang Jiapeng Tang Matthias Niessner and Peter Wonka. 2023. 3dshape2vecset: A 3d shape representation for neural fields and generative diffusion models. ACM Transactions on Graphics 42 4 (2023) 1\u201316.","DOI":"10.1145\/3592442"},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"crossref","unstructured":"Longwen Zhang Ziyu Wang Qixuan Zhang Qiwei Qiu Anqi Pang Haoran Jiang Wei Yang Lan Xu and Jingyi Yu. 2024. CLAY: A Controllable Large-scale Generative Model for Creating High-quality 3D Assets. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u201320.","DOI":"10.1145\/3658146"},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_1_73_2","doi-asserted-by":"crossref","unstructured":"Xuying Zhang Yutong Liu Yangguang Li Renrui Zhang Yufei Liu Kai Wang Wanli Ouyang Zhiwei Xiong Peng Gao Qibin Hou and Ming-Ming Cheng. 2025. Tar3d: Creating high-quality 3d assets via next-part prediction. (2025) 5134\u20135145.","DOI":"10.1109\/ICCV51701.2025.00488"},{"key":"e_1_3_3_1_74_2","doi-asserted-by":"crossref","unstructured":"Xuying Zhang Yupeng Zhou Kai Wang Yikai Wang Zhen Li Shaohui Jiao Daquan Zhou Qibin Hou and Ming-Ming Cheng. 2025. Ar-1-to-3: Single image to consistent 3d object generation via next-view prediction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.12929 (2025).","DOI":"10.1109\/ICCV51701.2025.02438"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:08:30Z","timestamp":1781536110000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810588"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":73,"alternative-id":["10.1145\/3805622.3810588","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810588","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}