{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,25]],"date-time":"2026-01-25T04:30:55Z","timestamp":1769315455328,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2023M731957"],"award-info":[{"award-number":["2023M731957"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306165"],"award-info":[{"award-number":["62306165"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Key Laboratory of next generation interactive media innovative technology","award":["ZDSYS20210623092001004"],"award-info":[{"award-number":["ZDSYS20210623092001004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680994","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"6715-6724","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["Consistent123: One Image to Highly Consistent 3D Asset Using Case-Aware Diffusion 
Priors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2469-5690","authenticated-orcid":false,"given":"Yukang","family":"Lin","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4894-3860","authenticated-orcid":false,"given":"Haonan","family":"Han","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1790-0434","authenticated-orcid":false,"given":"Chaoqun","family":"Gong","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5586-4971","authenticated-orcid":false,"given":"Zunnan","family":"Xu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6153-5004","authenticated-orcid":false,"given":"Yachao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0403-1923","authenticated-orcid":false,"given":"Xiu","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00097"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"e_1_3_2_1_3_1","first-page":"35799","article-title":"Objaverse-XL: A Universe of 10M 3D Objects","volume":"36","author":"Deitke Matt","year":"2023","unstructured":"Matt Deitke, Ruoshi Liu, Matthew Wallingford, Huong Ngo, Oscar Michel, Aditya Kusupati, Alan Fan, Christian Laforte, Vikram Voleti, Samir Yitzhak Gadre, Eli VanderBilt, Aniruddha Kembhavi, Carl Vondrick, Georgia Gkioxari, Kiana Ehsani, Ludwig Schmidt, and Ali Farhadi. 2023. Objaverse-XL: A Universe of 10M 3D Objects. 
In Advances in Neural Information Processing Systems (NeurIPS), Vol. 36. 35799--35813.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811809"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01061"},{"key":"e_1_3_2_1_6_1","volume-title":"Point2mesh: A self-prior for deformable meshes. arXiv preprint arXiv:2005.11084","author":"Hanocka Rana","year":"2020","unstructured":"Rana Hanocka, Gal Metzer, Raja Giryes, and Daniel Cohen-Or. 2020. Point2mesh: A self-prior for deformable meshes. arXiv preprint arXiv:2005.11084 (2020)."},{"key":"e_1_3_2_1_7_1","volume-title":"Zero-Shot Text-Guided Object Generation with Dream Fields. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 867--876","author":"Jain Ajay","year":"2022","unstructured":"Ajay Jain, Ben Mildenhall, Jonathan T. Barron, Pieter Abbeel, and Ben Poole. 2022. Zero-Shot Text-Guided Object Generation with Dream Fields. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 867--876."},{"key":"e_1_3_2_1_8_1","volume-title":"Shap-e: Generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463","author":"Jun Heewoo","year":"2023","unstructured":"Heewoo Jun and Alex Nichol. 2023. Shap-e: Generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01839"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). 19730--19742","author":"Junnan","unstructured":"Junnan Li et al. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the International Conference on Machine Learning (ICML). 
19730--19742."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the International Conference on Machine Learning (ICML). 12888--12900."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"e_1_3_2_1_13_1","first-page":"22226","article-title":"One-2--3--45: Any Single Image to 3D Mesh in 45 Seconds without Per-Shape Optimization","volume":"36","author":"Liu Minghua","year":"2023","unstructured":"Minghua Liu, Chao Xu, Haian Jin, Linghao Chen, Mukund Varma T, Zexiang Xu, and Hao Su. 2023. One-2--3--45: Any Single Image to 3D Mesh in 45 Seconds without Per-Shape Optimization. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 36. 22226--22246.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00816"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00459"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555392"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530127"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00025"},{"key":"e_1_3_2_1_21_1","volume-title":"The Eleventh International Conference on Learning Representations (ICLR).","author":"Poole Ben","year":"2023","unstructured":"Ben Poole, Ajay Jain, Jonathan T. Barron, and Ben Mildenhall. 2023. 
DreamFusion: Text-to-3D using 2D Diffusion. In The Eleventh International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_22_1","volume-title":"Magic123: One Image to High-Quality 3D Object Generation Using Both 2D and 3D Diffusion Priors. arXiv preprint arXiv:2306.17843","author":"Qian Guocheng","year":"2023","unstructured":"Guocheng Qian, Jinjie Mai, Abdullah Hamdi, Jian Ren, Aliaksandr Siarohin, Bing Li, Hsin-Ying Lee, Ivan Skorokhodov, Peter Wonka, Sergey Tulyakov, and Bernard Ghanem. 2023. Magic123: One Image to High-Quality 3D Object Generation Using Both 2D and 3D Diffusion Priors. arXiv preprint arXiv:2306.17843 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning (ICML). 8748--8763."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the International Conference on Machine Learning (ICML). 8748--8763."},{"key":"e_1_3_2_1_25_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. 
Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591503"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544777"},{"key":"e_1_3_2_1_28_1","volume-title":"High-Resolution Image Synthesis With Latent Diffusion Models. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10684--10695","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis With Latent Diffusion Models. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10684--10695."},{"key":"e_1_3_2_1_29_1","volume-title":"Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation. arXiv preprint arXiv:2303.07937","author":"Seo Junyoung","year":"2023","unstructured":"Junyoung Seo, Wooseok Jang, Min-Seop Kwak, Jaehoon Ko, Hyeonsu Kim, Junho Kim, Jin-Hwa Kim, Jiyoung Lee, and Seungryong Kim. 2023. Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation. arXiv preprint arXiv:2303.07937 (2023)."},{"key":"e_1_3_2_1_30_1","first-page":"6087","article-title":"Deep Marching Tetrahedra: a Hybrid Representation for High-Resolution 3D Shape Synthesis","volume":"34","author":"Shen Tianchang","year":"2021","unstructured":"Tianchang Shen, Jun Gao, Kangxue Yin, Ming-Yu Liu, and Sanja Fidler. 2021. Deep Marching Tetrahedra: a Hybrid Representation for High-Resolution 3D Shape Synthesis. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 34. 6087--6101.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_31_1","volume-title":"Stable-dreamfusion: Text-to-3D with Stable-diffusion. https:\/\/github.com\/ashawkey\/stable-dreamfusion.","author":"Tang Jiaxiang","year":"2022","unstructured":"Jiaxiang Tang. 
2022. Stable-dreamfusion: Text-to-3D with Stable-diffusion. https:\/\/github.com\/ashawkey\/stable-dreamfusion."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02086"},{"key":"e_1_3_2_1_33_1","volume-title":"CLIP-NeRF: Text-and-Image Driven Manipulation of Neural Radiance Fields. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 3825--3834","author":"Wang Can","year":"2022","unstructured":"Can Wang, Menglei Chai, Mingming He, Dongdong Chen, and Jing Liao. 2022. CLIP-NeRF: Text-and-Image Driven Manipulation of Neural Radiance Fields. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 3825--3834."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_4"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00443"},{"key":"e_1_3_2_1_36_1","volume-title":"Pretraining is all you need for image-to-image translation. arXiv preprint arXiv:2205.12952","author":"Wang Tengfei","year":"2022","unstructured":"Tengfei Wang, Ting Zhang, Bo Zhang, Hao Ouyang, Dong Chen, Qifeng Chen, and Fang Wen. 2022. Pretraining is all you need for image-to-image translation. arXiv preprint arXiv:2205.12952 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"High-Fidelity GAN Inversion for Image Attribute Editing. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 11379--11388","author":"Wang Tengfei","year":"2022","unstructured":"Tengfei Wang, Yong Zhang, Yanbo Fan, Jue Wang, and Qifeng Chen. 2022. High-Fidelity GAN Inversion for Image Attribute Editing. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 
11379--11388."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00209"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00039"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01605"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00041"},{"key":"e_1_3_2_1_42_1","volume-title":"CloSET: Modeling Clothed Humans on Continuous Surface with Explicit Template Decomposition. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 501--511","author":"Zhang Hongwen","year":"2023","unstructured":"Hongwen Zhang, Siyou Lin, Ruizhi Shao, Yuxiang Zhang, Zerong Zheng, Han Huang, Yandong Guo, and Yebin Liu. 2023. CloSET: Modeling Clothed Humans on Continuous Surface with Explicit Template Decomposition. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 501--511."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680994","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680994","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:35Z","timestamp":1750295855000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680994"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":43,"alternative-id":["10.1145\/3664647.3680994","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680994","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}