{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:53:10Z","timestamp":1777654390796,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2022YFB3104703"],"award-info":[{"award-number":["2022YFB3104703"]}]},{"name":"National Natural Science Foundation of China","award":["62172103"],"award-info":[{"award-number":["62172103"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612363","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"6860-6868","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-6521-2145","authenticated-orcid":false,"given":"Haibo","family":"Yang","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9058-5051","authenticated-orcid":false,"given":"Yang","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4344-8898","authenticated-orcid":false,"given":"Yingwei","family":"Pan","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7587-101X","authenticated-orcid":false,"given":"Ting","family":"Yao","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1543-6889","authenticated-orcid":false,"given":"Zhineng","family":"Chen","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5990-7307","authenticated-orcid":false,"given":"Tao","family":"Mei","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Jonathan T. Barron Ben Mildenhall Dor Verbin Pratul P. Srinivasan and Peter Hedman. 2022. Mip-NeRF 360: Unbounded Anti-Aliased Neural Radiance Fields. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00539"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Tim Brooks Aleksander Holynski and Alexei A Efros. 2023. Instructpix2pix: Learning to follow image editing instructions. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_3_1","volume-title":"TANGO: Text-driven Photorealistic and Robust 3D Stylization via Lighting Decomposition. In NeurIPS.","author":"Chen Yongwei","year":"2022","unstructured":"Yongwei Chen, Rui Chen, Jiabao Lei, Yabin Zhang, and Kui Jia. 2022. TANGO: Text-driven Photorealistic and Robust 3D Stylization via Lighting Decomposition. In NeurIPS."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Yang Chen Yingwei Pan Ting Yao Xinmei Tian and Tao Mei. 2019a. Animating Your Life: Real-Time Video-to-Animation Translation. In ACM MM Demo.","DOI":"10.1145\/3343031.3350593"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350937"},{"key":"e_1_3_2_1_6_1","volume-title":"Objaverse: A Universe of Annotated 3D Objects. 
In CVPR.","author":"Deitke Matt","year":"2023","unstructured":"Matt Deitke, Dustin Schwenk, Jordi Salvador, Luca Weihs, Oscar Michel, Eli VanderBilt, Ludwig Schmidt, Kiana Ehsani, Aniruddha Kembhavi, and Ali Farhadi. 2023. Objaverse: A Universe of Annotated 3D Objects. In CVPR."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Patrick Esser Robin Rombach and Bj\u00f6rn Ommer. 2021. Taming Transformers for High-Resolution Image Synthesis. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_9_1","unstructured":"Yuan-Chen Guo Ying-Tian Liu Ruizhi Shao Christian Laforte Vikram Voleti Guan Luo Chia-Hao Chen Zi-Xin Zou Chen Wang Yan-Pei Cao and Song-Hai Zhang. 2023. threestudio: A unified framework for 3D content generation. https:\/\/github.com\/threestudio-project\/threestudio."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3267347","article-title":"ALIGNet: partial-shape agnostic alignment via unsupervised learning","volume":"38","author":"Hanocka Rana","year":"2018","unstructured":"Rana Hanocka, Noa Fish, Zhenhua Wang, Raja Giryes, Shachar Fleishman, and Daniel Cohen-Or. 2018. ALIGNet: partial-shape agnostic alignment via unsupervised learning. ACM Transactions on Graphics (TOG), Vol. 38, 1 (2018), 1.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_1_11_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In NeurIPS."},{"key":"e_1_3_2_1_12_1","volume-title":"NeurIPS Workshop.","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. In NeurIPS Workshop."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Nisha Huang Fan Tang Weiming Dong and Changsheng Xu. 2022. Draw your art dream: Diverse digital art synthesis with multimodal guided diffusion. 
In ACM MM.","DOI":"10.1145\/3503161.3548282"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Ajay Jain Ben Mildenhall Jonathan T Barron Pieter Abbeel and Ben Poole. 2022. Zero-shot text-guided object generation with dream fields. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Hiroharu Kato Yoshitaka Ushiku and Tatsuya Harada. 2018. Neural 3d mesh renderer. In CVPR.","DOI":"10.1109\/CVPR.2018.00411"},{"key":"e_1_3_2_1_16_1","volume-title":"Imagic: Text-Based Real Image Editing with Diffusion Models. In CVPR.","author":"Kawar Bahjat","year":"2023","unstructured":"Bahjat Kawar, Shiran Zada, Oran Lang, Omer Tov, Huiwen Chang, Tali Dekel, Inbar Mosseri, and Michal Irani. 2023. Imagic: Text-Based Real Image Editing with Diffusion Models. In CVPR."},{"key":"e_1_3_2_1_17_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_1_18_1","unstructured":"Yehao Li Yingwei Pan Ting Yao Jingwen Chen and Tao Mei. 2021. Scheduled sampling in vision-language pretraining with decoupled encoder-decoder network. In AAAI."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3164083"},{"key":"e_1_3_2_1_20_1","unstructured":"Chen-Hsuan Lin Jun Gao Luming Tang Towaki Takikawa Xiaohui Zeng Xun Huang Karsten Kreis Sanja Fidler Ming-Yu Liu and Tsung-Yi Lin. 2023. Magic3D: High-Resolution Text-to-3D Content Creation. In CVPR."},{"key":"e_1_3_2_1_21_1","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie Lubomir Bourdev Ross Girshick James Hays Pietro Perona Deva Ramanan C. Lawrence Zitnick and Piotr Doll\u00e1r. 2014. 
Microsoft COCO: Common Objects in Context. In ECCV."},{"key":"e_1_3_2_1_22_1","first-page":"221","article-title":"Paparazzi: surface editing by way of multi-view image processing","volume":"37","author":"Derek Liu Hsueh-Ti","year":"2018","unstructured":"Hsueh-Ti Derek Liu, Michael Tao, and Alec Jacobson. 2018. Paparazzi: surface editing by way of multi-view image processing. ACM Trans. Graph., Vol. 37, 6 (2018), 221--1.","journal-title":"ACM Trans. Graph."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Gal Metzer Elad Richardson Or Patashnik Raja Giryes and Daniel Cohen-Or. 2023. Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01218"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Oscar Michel Roi Bar-On Richard Liu Sagie Benaim and Rana Hanocka. 2022. Text2Mesh: Text-Driven Neural Stylization for Meshes. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01313"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul P. Srinivasan Matthew Tancik Jonathan T. Barron Ravi Ramamoorthi and Ren Ng. 2020. NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_1_26_1","volume-title":"Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784","author":"Mirza Mehdi","year":"2014","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784 (2014)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Nasir Mohammad Khalid Tianhao Xie Eugene Belilovsky and Tiberiu Popa. 2022. CLIP-Mesh: Generating textured meshes from text using pretrained image-text models. In SIGGRAPH Asia.","DOI":"10.1145\/3550469.3555392"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Ron Mokady Amir Hertz Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2023. 
Null-text Inversion for Editing Real Images using Guided Diffusion Models. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_29_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In PMLR.","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2022a. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In PMLR."},{"key":"e_1_3_2_1_30_1","volume-title":"Point-E: A System for Generating 3D Point Clouds from Complex Prompts. arXiv preprint arXiv:2212.08751","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Heewoo Jun, Prafulla Dhariwal, Pamela Mishkin, and Mark Chen. 2022b. Point-E: A System for Generating 3D Point Clouds from Complex Prompts. arXiv preprint arXiv:2212.08751 (2022)."},{"key":"e_1_3_2_1_31_1","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In ICLR."},{"key":"e_1_3_2_1_32_1","unstructured":"Yingwei Pan Yehao Li Jianjie Luo Jun Xu Ting Yao and Tao Mei. 2022. Auto-captions on GIF: A large-scale video-sentence dataset for vision-language pre-training. In ACM MM."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3127905"},{"key":"e_1_3_2_1_34_1","unstructured":"Dario Pavllo Graham Spinks Thomas Hofmann Marie-Francine Moens and Aurelien Lucchi. 2020. Convolutional Generation of Textured 3D Meshes. In NeurIPS."},{"key":"e_1_3_2_1_35_1","volume-title":"Dreamfusion: Text-to-3d using 2d diffusion. In ICLR.","author":"Poole Ben","year":"2023","unstructured":"Ben Poole, Ajay Jain, Jonathan T Barron, and Ben Mildenhall. 2023. Dreamfusion: Text-to-3d using 2d diffusion. 
In ICLR."},{"key":"e_1_3_2_1_36_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_37_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_38_1","unstructured":"Scott Reed Zeynep Akata Xinchen Yan Lajanugen Logeswaran Bernt Schiele and Honglak Lee. 2016. Generative Adversarial Text-to-Image Synthesis. In ICML."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/0146-664X(82)90169-1"},{"key":"e_1_3_2_1_41_1","volume-title":"Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In CVPR.","author":"Ruiz Nataniel","year":"2023","unstructured":"Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yael Pritch, Michael Rubinstein, and Kfir Aberman. 2023. Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. 
In CVPR."},{"key":"e_1_3_2_1_42_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. In NeurIPS."},{"key":"e_1_3_2_1_43_1","unstructured":"Jascha Sohl-Dickstein Eric Weiss Niru Maheswaranathan and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In ICML."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Haochen Wang Xiaodan Du Jiahao Li Raymond A Yeh and Greg Shakhnarovich. 2023. Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"e_1_3_2_1_45_1","volume-title":"Omnivl: One foundation model for image-language and video-language tasks. In NeurIPS.","author":"Wang Junke","year":"2022","unstructured":"Junke Wang, Dongdong Chen, Zuxuan Wu, Chong Luo, Luowei Zhou, Yucheng Zhao, Yujia Xie, Ce Liu, Yu-Gang Jiang, and Lu Yuan. 2022. Omnivl: One foundation model for image-language and video-language tasks. In NeurIPS."},{"key":"e_1_3_2_1_46_1","unstructured":"Zejia Weng Xitong Yang Ang Li Zuxuan Wu and Yu-Gang Jiang. 2023. Open-VCLIP: Transforming CLIP to an Open-vocabulary Video Model via Interpolated Weight Optimization. In ICML."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3268446"},{"key":"e_1_3_2_1_48_1","volume-title":"Physg: Inverse rendering with spherical gaussians for physics-based material editing and relighting. In CVPR.","author":"Zhang Kai","year":"2021","unstructured":"Kai Zhang, Fujun Luan, Qianqian Wang, Kavita Bala, and Noah Snavely. 2021. Physg: Inverse rendering with spherical gaussians for physics-based material editing and relighting. 
In CVPR."},{"key":"e_1_3_2_1_49_1","volume-title":"Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543","author":"Zhang Lvmin","year":"2023","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2023. Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Richard Zhang Phillip Isola Alexei A Efros Eli Shechtman and Oliver Wang. 2018. The Unreasonable Effectiveness of Deep Features as a Perceptual Metric. In CVPR.","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Yiheng Zhang Zhaofan Qiu Yingwei Pan Ting Yao and Tao Mei. 2023. Learning Neural Implicit Surfaces with Object-Aware Radiance Fields. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01640"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612363","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612363","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:38Z","timestamp":1755821018000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612363"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":51,"alternative-id":["10.1145\/3581783.3612363","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612363","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}