{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T19:44:37Z","timestamp":1770752677884,"version":"3.50.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"This work is supported by the National Key R&D Program of China under Grant No. 2021QY1500, the State Key Program of the National Natural Science Foundation of China (NSFC) (No.61831022).","award":["2021QY1500"],"award-info":[{"award-number":["2021QY1500"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612022","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"5734-5745","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["AvatarFusion: Zero-shot Generation of Clothing-Decoupled 3D Avatars Using 2D Diffusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7100-1067","authenticated-orcid":false,"given":"Shuo","family":"Huang","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8783-8313","authenticated-orcid":false,"given":"Zongxin","family":"Yang","sequence":"additional","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2265-4026","authenticated-orcid":false,"given":"Liangting","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0512-880X","authenticated-orcid":false,"given":"Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8449-278X","authenticated-orcid":false,"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University &amp; Beijing National Research Center for Information Science and Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_21"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"e_1_3_2_1_3_1","volume-title":"Segment and track anything. arXiv preprint arXiv:2305.06558","author":"Cheng Yangming","year":"2023","unstructured":"Yangming Cheng, Liulei Li, Yuanyou Xu, Xiaodi Li, Zongxin Yang, Wenguan Wang, and Yi Yang. 2023. Segment and track anything. 
arXiv preprint arXiv:2305.06558 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01170"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/378456.378484"},{"key":"e_1_3_2_1_7_1","volume-title":"Capturing and animation of body and clothing from monocular video. arXiv preprint arXiv:2210.01868","author":"Feng Yao","year":"2022","unstructured":"Yao Feng, Jinlong Yang, Marc Pollefeys, Michael J Black, and Timo Bolkart. 2022. Capturing and animation of body and clothing from monocular video. arXiv preprint arXiv:2210.01868 (2022)."},{"key":"e_1_3_2_1_8_1","unstructured":"Geitgey. 2017. Face Recognition. https:\/\/github.com\/ageitgey\/face_recognition"},{"key":"e_1_3_2_1_9_1","volume-title":"Avatarclip: Zero-shot text-driven generation and animation of 3d avatars. arXiv preprint arXiv:2205.08535","author":"Hong Fangzhou","year":"2022","unstructured":"Fangzhou Hong, Mingyuan Zhang, Liang Pan, Zhongang Cai, Lei Yang, and Ziwei Liu. 2022. Avatarclip: Zero-shot text-driven generation and animation of 3d avatars. arXiv preprint arXiv:2205.08535 (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"e_1_3_2_1_11_1","volume-title":"Bcnet: Learning body and cloth shape from a single image. In Computer Vision-ECCV 2020: 16th European Conference","author":"Jiang Boyi","year":"2020","unstructured":"Boyi Jiang, Juyong Zhang, Yang Hong, Jinhao Luo, Ligang Liu, and Hujun Bao. 2020. Bcnet: Learning body and cloth shape from a single image. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XX 16. Springer, 18--35."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555513"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459884"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3272127.3275055"},{"key":"e_1_3_2_1_15_1","volume-title":"Magic3D: High-Resolution Text-to-3D Content Creation. arXiv preprint arXiv:2211.10440","author":"Lin Chen-Hsuan","year":"2022","unstructured":"Chen-Hsuan Lin, Jun Gao, Luming Tang, Towaki Takikawa, Xiaohui Zeng, Xun Huang, Karsten Kreis, Sanja Fidler, Ming-Yu Liu, and Tsung-Yi Lin. 2022. Magic3D: High-Resolution Text-to-3D Content Creation. arXiv preprint arXiv:2211.10440 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00286"},{"key":"e_1_3_2_1_18_1","volume-title":"Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures. arXiv preprint arXiv:2211.07600","author":"Metzer Gal","year":"2022","unstructured":"Gal Metzer, Elad Richardson, Or Patashnik, Raja Giryes, and Daniel Cohen-Or. 2022. Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures. arXiv preprint arXiv:2211.07600 (2022)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01313"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_21_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. 
arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341156"},{"key":"e_1_3_2_1_23_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_24_1","volume-title":"Point-E: A System for Generating 3D Point Clouds from Complex Prompts. arXiv preprint arXiv:2212.08751","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Heewoo Jun, Prafulla Dhariwal, Pamela Mishkin, and Mark Chen. 2022. Point-E: A System for Generating 3D Point Clouds from Complex Prompts. arXiv preprint arXiv:2212.08751 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00356"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13503--13513","author":"Or-El Roy","year":"2022","unstructured":"Roy Or-El, Xuan Luo, Mengyi Shan, Eli Shechtman, Jeong Joon Park, and Ira Kemelmacher-Shlizerman. 2022. Stylesdf: High-resolution 3d-consistent image and geometry generation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13503--13513."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00328"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00244"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00739"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01405"},{"key":"e_1_3_2_1_32_1","volume-title":"Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:2209.14988","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T Barron, and Ben Mildenhall. 2022. Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:2209.14988 (2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_35_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_36_1","volume-title":"Texture: Text-guided texturing of 3d shapes. arXiv preprint arXiv:2302.01721","author":"Richardson Elad","year":"2023","unstructured":"Elad Richardson, Gal Metzer, Yuval Alaluf, Raja Giryes, and Daniel Cohen-Or. 2023. 
Texture: Text-guided texturing of 3d shapes. arXiv preprint arXiv:2302.01721 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_38_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_39_1","volume-title":"Computer Graphics Forum","author":"Santesteban Igor","unstructured":"Igor Santesteban, Miguel A Otaduy, and Dan Casas. 2019. Learning-based animation of clothing for virtual try-on. In Computer Graphics Forum, Vol. 38. Wiley Online Library, 355--366."},{"key":"e_1_3_2_1_40_1","unstructured":"Maximilian Seitzer. 2020. pytorch-fid: FID Score for PyTorch. https:\/\/github.com\/mseitzer\/pytorch-fid. Version 0.3.0."},{"key":"e_1_3_2_1_41_1","volume-title":"Stable-dreamfusion: Text-to-3D with Stable-diffusion. https:\/\/github.com\/ashawkey\/stable-dreamfusion.","author":"Tang Jiaxiang","year":"2022","unstructured":"Jiaxiang Tang. 2022. Stable-dreamfusion: Text-to-3D with Stable-diffusion. https:\/\/github.com\/ashawkey\/stable-dreamfusion."},{"key":"e_1_3_2_1_42_1","volume-title":"Zero-shot image-to-text generation for visual-semantic arithmetic. arXiv preprint arXiv:2111.14447","author":"Tewel Yoad","year":"2021","unstructured":"Yoad Tewel, Yoav Shalev, Idan Schwartz, and Lior Wolf. 2021. Zero-shot image-to-text generation for visual-semantic arithmetic. arXiv preprint arXiv:2111.14447 (2021)."},{"key":"e_1_3_2_1_43_1","volume-title":"Computer Graphics Forum","author":"Vidaurre Raquel","unstructured":"Raquel Vidaurre, Igor Santesteban, Elena Garces, and Dan Casas. 2020. Fully convolutional graph neural networks for parametric virtual try-on. In Computer Graphics Forum, Vol. 39. Wiley Online Library, 145--156."},{"key":"e_1_3_2_1_44_1","volume-title":"NeRF-Art: Text-Driven Neural Radiance Fields Stylization. arXiv preprint arXiv:2212.08070","author":"Wang Can","year":"2022","unstructured":"Can Wang, Ruixiang Jiang, Menglei Chai, Mingming He, Dongdong Chen, and Jing Liao. 2022a. NeRF-Art: Text-Driven Neural Radiance Fields Stylization. arXiv preprint arXiv:2212.08070 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Neus: Learning neural implicit surfaces by","author":"Wang Peng","year":"2021","unstructured":"Peng Wang, Lingjie Liu, Yuan Liu, Christian Theobalt, Taku Komura, and Wenping Wang. 2021. Neus: Learning neural implicit surfaces by volume rendering for multi-view reconstruction. arXiv preprint arXiv:2106.10689 (2021)."},{"key":"e_1_3_2_1_46_1","volume-title":"Rodin: A Generative Model for Sculpting 3D Digital Avatars Using Diffusion. arXiv preprint arXiv:2212.06135","author":"Wang Tengfei","year":"2022","unstructured":"Tengfei Wang, Bo Zhang, Ting Zhang, Shuyang Gu, Jianmin Bao, Tadas Baltrusaitis, Jingjing Shen, Dong Chen, Fang Wen, Qifeng Chen, et al. 2022c. Rodin: A Generative Model for Sculpting 3D Digital Avatars Using Diffusion. 
arXiv preprint arXiv:2212.06135 (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00917"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00776"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480545"},{"key":"e_1_3_2_1_51_1","volume-title":"A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model. arXiv preprint arXiv:2112.14757","author":"Xu Mengde","year":"2021","unstructured":"Mengde Xu, Zheng Zhang, Fangyun Wei, Yutong Lin, Yue Cao, Han Hu, and Xiang Bai. 2021. A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model. arXiv preprint arXiv:2112.14757 (2021)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.2100463"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01479"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25066-8_39"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00577"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_30"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612022","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612022","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:57Z","timestamp":1755821037000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612022"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":56,"alternative-id":["10.1145\/3581783.3612022","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612022","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
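
The record above is a standard Crossref REST API "work" message, so it can be re-fetched and unpacked programmatically. What follows is a minimal Python sketch of doing that; it assumes the third-party requests library and network access, the User-Agent contact address is a placeholder, and every field name used is taken directly from the record above. Crossref "date-parts" arrays may omit the month or day, which is why the date handling pads them.

import requests
from datetime import date

DOI = "10.1145/3581783.3612022"

# Fetch the same record from the public Crossref works endpoint.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    # Placeholder contact address for Crossref's polite pool (an assumption, not from the record).
    headers={"User-Agent": "metadata-check/0.1 (mailto:you@example.org)"},
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # the envelope wraps the record in "status"/"message-type"/"message"

# A few of the fields present in the record above.
print(work["title"][0])               # proceedings-article title
print(work["DOI"], work.get("page"))  # DOI and page range "5734-5745"
print("cited by:", work["is-referenced-by-count"])

# "date-parts" arrays may omit month and/or day; pad missing parts with 1.
parts = work["issued"]["date-parts"][0]  # e.g. [2023, 10, 26]
y, m, d = (parts + [1, 1])[:3]
print("issued:", date(y, m, d))

# Reference entries mix DOI-only items and unstructured citation strings.
for ref in work.get("reference", [])[:5]:
    print(" -", ref.get("DOI") or ref.get("unstructured", ref["key"]))

Run against this DOI, the sketch should reproduce the title, page range, citation count, issued date, and first few reference entries shown above; any drift simply means Crossref has re-indexed the record since the "deposited" timestamp.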