{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T09:55:15Z","timestamp":1766138115522,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No. 2020AAA0106200"],"award-info":[{"award-number":["No. 2020AAA0106200"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Nos. 61832016, U20B2070, 6210070958, 62102162"],"award-info":[{"award-number":["Nos. 61832016, U20B2070, 6210070958, 62102162"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Open Projects Program of NLPR"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548282","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:46Z","timestamp":1665416566000},"page":"1085-1094","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":36,"title":["Draw Your Art Dream: Diverse Digital Art Synthesis with Multimodal Guided Diffusion"],"prefix":"10.1145","author":[{"given":"Nisha","family":"Huang","sequence":"first","affiliation":[{"name":"UCAS &amp; Institute of Automation, CAS, Beijing, China"}]},{"given":"Fan","family":"Tang","sequence":"additional","affiliation":[{"name":"Jilin University, Changchun, China"}]},{"given":"Weiming","family":"Dong","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; UCAS, Beijing, China"}]},{"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; UCAS, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2022. Yahoo Flickr Creative Commons 100 Million (YFCC100m) dataset. http:\/\/projects.dfki.uni-kl.de\/yfcc100m\/  2022. Yahoo Flickr Creative Commons 100 Million (YFCC100m) dataset. http:\/\/projects.dfki.uni-kl.de\/yfcc100m\/"},{"key":"e_1_3_2_2_2_1","unstructured":"Adverb. 2022. The BigSleep: BigGANCLIP. https:\/\/colab.research.google.com\/ drive\/1NCceX2mbiKOSlAd_o7IU7nA9UskKN5WR?usp=sharing#scrollTo= WtlDVVMvzMUd  Adverb. 2022. The BigSleep: BigGANCLIP. https:\/\/colab.research.google.com\/ drive\/1NCceX2mbiKOSlAd_o7IU7nA9UskKN5WR?usp=sharing#scrollTo= WtlDVVMvzMUd"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1162\/leon_a_01768"},{"key":"e_1_3_2_2_4_1","volume-title":"Large Scale GAN Training for High Fidelity Natural Image Synthesis. In International Conference on Learning Representations (ICLR).","author":"Brock Andrew","year":"2018","unstructured":"Andrew Brock , Jeff Donahue , and Karen Simonyan . 2018 . Large Scale GAN Training for High Fidelity Natural Image Synthesis. In International Conference on Learning Representations (ICLR). Andrew Brock, Jeff Donahue, and Karen Simonyan. 2018. Large Scale GAN Training for High Fidelity Natural Image Synthesis. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_5_1","unstructured":"Johannes Buchner. 2021. ImageHash:An image hashing library written in Python. https:\/\/github.com\/JohannesBuchner\/imagehash  Johannes Buchner. 2021. ImageHash:An image hashing library written in Python. https:\/\/github.com\/JohannesBuchner\/imagehash"},{"key":"e_1_3_2_2_6_1","volume-title":"DualAST: Dual Style-Learning Networks for Artistic Style Transfer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 872--881","author":"Chen Haibo","year":"2021","unstructured":"Haibo Chen , Lei Zhao , Zhizhong Wang , Huiming Zhang , Zhiwen Zuo , Ailin Li , Wei Xing , and Dongming Lu . 2021 . DualAST: Dual Style-Learning Networks for Artistic Style Transfer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 872--881 . Haibo Chen, Lei Zhao, Zhizhong Wang, Huiming Zhang, Zhiwen Zuo, Ailin Li, Wei Xing, and Dongming Lu. 2021. DualAST: Dual Style-Learning Networks for Artistic Style Transfer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 872--881."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01461"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_2_9_1","volume-title":"ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models. In IEEE\/CVF International Conference on Computer Vision (ICCV). 14347-- 14356","author":"Choi Jooyoung","year":"2021","unstructured":"Jooyoung Choi , Sungwon Kim , Yonghyun Jeong , Youngjune Gwon , and Sungroh Yoon . 2021 . ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models. In IEEE\/CVF International Conference on Computer Vision (ICCV). 14347-- 14356 . Jooyoung Choi, Sungwon Kim, Yonghyun Jeong, Youngjune Gwon, and Sungroh Yoon. 2021. ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models. In IEEE\/CVF International Conference on Computer Vision (ICCV). 14347-- 14356."},{"key":"e_1_3_2_2_10_1","unstructured":"Katherine Crowson and Chainbreakers AI. 2022. Diffusion 512x512 secondary model method. https:\/\/github.com\/crowsonkb\/v-diffusion-pytorch  Katherine Crowson and Chainbreakers AI. 2022. Diffusion 512x512 secondary model method. https:\/\/github.com\/crowsonkb\/v-diffusion-pytorch"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16208"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3016887"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01104"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414015"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-019-7271-7"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"e_1_3_2_2_17_1","unstructured":"Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Advances in Neural Information Processing Systems (NeurIPS).  Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_2_18_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy , Lucas Beyer , Alexander Kolesnikov , Dirk Weissenborn , Xiaohua Zhai , Thomas Unterthiner , Mostafa Dehghani , Matthias Minderer , Georg Heigold , Sylvain Gelly , 2020 . An image is worth 16x16 words: Transformers for image recognition at scale . In International Conference on Learning Representations (ICLR). Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_2_20_1","volume-title":"CLIPDraw: Exploring Text-to-Drawing Synthesis through Language-Image Encoders. arXiv preprint arXiv:2106.14843","author":"Frans Kevin","year":"2021","unstructured":"Kevin Frans , LB Soros , and Olaf Witkowski . 2021. CLIPDraw: Exploring Text-to-Drawing Synthesis through Language-Image Encoders. arXiv preprint arXiv:2106.14843 ( 2021 ). Kevin Frans, LB Soros, and Olaf Witkowski. 2021. CLIPDraw: Exploring Text-to-Drawing Synthesis through Language-Image Encoders. arXiv preprint arXiv:2106.14843 (2021)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Rinon Gal Or Patashnik Haggai Maron Gal Chechik and Daniel Cohen-Or. 2021. StyleGAN-NADA: CLIP-Guided Domain Adaptation of Image Generators. arXiv:2108.00946 [cs.CV]  Rinon Gal Or Patashnik Haggai Maron Gal Chechik and Daniel Cohen-Or. 2021. StyleGAN-NADA: CLIP-Guided Domain Adaptation of Image Generators. arXiv:2108.00946 [cs.CV]","DOI":"10.1145\/3528223.3530164"},{"key":"e_1_3_2_2_22_1","unstructured":"Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative Adversarial Nets. In Neural Information Processing Systems (NIPS).  Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative Adversarial Nets. In Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_2_23_1","volume-title":"Vector Quantized Diffusion Model for Text-to-Image Synthesis. arXiv preprint arXiv:2111.14822","author":"Gu Shuyang","year":"2021","unstructured":"Shuyang Gu , Dong Chen , Jianmin Bao , Fang Wen , Bo Zhang , Dongdong Chen , Lu Yuan , and Baining Guo . 2021. Vector Quantized Diffusion Model for Text-to-Image Synthesis. arXiv preprint arXiv:2111.14822 ( 2021 ). Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, and Baining Guo. 2021. Vector Quantized Diffusion Model for Text-to-Image Synthesis. arXiv preprint arXiv:2111.14822 (2021)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-015-0017-1"},{"key":"e_1_3_2_2_25_1","volume-title":"Denoising Diffusion Probabilistic Models. arXiv: Learning","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho , Ajay Jain , and Pieter Abbeel . 2020. Denoising Diffusion Probabilistic Models. arXiv: Learning ( 2020 ). Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. arXiv: Learning (2020)."},{"key":"e_1_3_2_2_26_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans . 2021 . Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications. Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0227-7"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0209-9"},{"key":"e_1_3_2_2_30_1","unstructured":"Ajay Jain. 2021. VectorAscent: Generate vector graphics from a textual description. https:\/\/github.com\/ajayjain\/VectorAscent  Ajay Jain. 2021. VectorAscent: Generate vector graphics from a textual description. https:\/\/github.com\/ajayjain\/VectorAscent"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"e_1_3_2_2_32_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Jolicoeur-Martineau Alexia","year":"2021","unstructured":"Alexia Jolicoeur-Martineau , R\u00e9mi Pich\u00e9-Taillefer , Ioannis Mitliagkas , and Remi Tachet des Combes . 2021 . Adversarial score matching and improved sampling for image generation . In International Conference on Learning Representations (ICLR). Alexia Jolicoeur-Martineau, R\u00e9mi Pich\u00e9-Taillefer, Ioannis Mitliagkas, and Remi Tachet des Combes. 2021. Adversarial score matching and improved sampling for image generation. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_33_1","volume-title":"A Style-Based Generator Architecture for Generative Adversarial Networks. arXiv: Neural and Evolutionary Computing","author":"Karras Tero","year":"2018","unstructured":"Tero Karras , Samuli Laine , and Timo Aila . 2018. A Style-Based Generator Architecture for Generative Adversarial Networks. arXiv: Neural and Evolutionary Computing ( 2018 ). Tero Karras, Samuli Laine, and Timo Aila. 2018. A Style-Based Generator Architecture for Generative Adversarial Networks. arXiv: Neural and Evolutionary Computing (2018)."},{"key":"e_1_3_2_2_34_1","volume-title":"Analyzing and Improving the Image Quality of StyleGAN. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 8107--8116","author":"Karras Tero","year":"2020","unstructured":"Tero Karras , Samuli Laine , Miika Aittala , Janne Hellsten , Jaakko Lehtinen , and Timo Aila . 2020 . Analyzing and Improving the Image Quality of StyleGAN. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 8107--8116 . Tero Karras, Samuli Laine, Miika Aittala, Janne Hellsten, Jaakko Lehtinen, and Timo Aila. 2020. Analyzing and Improving the Image Quality of StyleGAN. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 8107--8116."},{"key":"e_1_3_2_2_35_1","unstructured":"Gwanghyun Kim Taesung Kwon and Jong Chul Ye. 2021. DiffusionCLIP: Text- Guided Diffusion Models for Robust Image Manipulation. (2021). https:\/\/doi.org\/10.48550\/ARXIV.2110.02711  Gwanghyun Kim Taesung Kwon and Jong Chul Ye. 2021. DiffusionCLIP: Text- Guided Diffusion Models for Robust Image Manipulation. (2021). https:\/\/doi.org\/10.48550\/ARXIV.2110.02711"},{"key":"e_1_3_2_2_36_1","volume-title":"Kingma and Prafulla Dhariwal","author":"Diederik","year":"2018","unstructured":"Diederik P. Kingma and Prafulla Dhariwal . 2018 . Glow : Generative Flow with Invertible 1x1 Convolutions. arXiv: Machine Learning ( 2018). Diederik P. Kingma and Prafulla Dhariwal. 2018. Glow: Generative Flow with Invertible 1x1 Convolutions. arXiv: Machine Learning (2018)."},{"key":"e_1_3_2_2_37_1","volume-title":"Content and Style Disentanglement for Artistic Style Transfer. In IEEE\/CVF International Conference on Computer Vision (ICCV). 4422--4431","author":"Kotovenko Dmytro","year":"2019","unstructured":"Dmytro Kotovenko , Artsiom Sanakoyeu , Sabine Lang , and Bjorn Ommer . 2019 . Content and Style Disentanglement for Artistic Style Transfer. In IEEE\/CVF International Conference on Computer Vision (ICCV). 4422--4431 . Dmytro Kotovenko, Artsiom Sanakoyeu, Sabine Lang, and Bjorn Ommer. 2019. Content and Style Disentanglement for Artistic Style Transfer. In IEEE\/CVF International Conference on Computer Vision (ICCV). 4422--4431."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01753"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417763"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450525"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00658"},{"key":"e_1_3_2_2_43_1","unstructured":"Xingchao Liu Chengyue Gong Lemeng Wu Shujian Zhang Hao Su and Qiang Liu. 2021. FuseDream: Training-Free Text-to-Image Generation with Improved CLIPGAN Space Optimization. arXiv:2112.01573 [cs.CV]  Xingchao Liu Chengyue Gong Lemeng Wu Shujian Zhang Hao Su and Qiang Liu. 2021. FuseDream: Training-Free Text-to-Image Generation with Improved CLIPGAN Space Optimization. arXiv:2112.01573 [cs.CV]"},{"key":"e_1_3_2_2_44_1","volume-title":"Samaneh Azadi, Gong Zhang, Arman Chopikyan, Yuxiao Hu, Humphrey Shi, Anna Rohrbach, and Trevor Darrell.","author":"Liu Xihui","year":"2021","unstructured":"Xihui Liu , Dong Huk Park , Samaneh Azadi, Gong Zhang, Arman Chopikyan, Yuxiao Hu, Humphrey Shi, Anna Rohrbach, and Trevor Darrell. 2021 . More Control for Free! Image Synthesis with Semantic Diffusion Guidance . arXiv:2112.05744 [cs.CV] Xihui Liu, Dong Huk Park, Samaneh Azadi, Gong Zhang, Arman Chopikyan, Yuxiao Hu, Humphrey Shi, Anna Rohrbach, and Trevor Darrell. 2021. More Control for Free! Image Synthesis with Semantic Diffusion Guidance. arXiv:2112.05744 [cs.CV]"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413505"},{"key":"e_1_3_2_2_46_1","volume-title":"Generating High Fidelity Images with Subscale Pixel Networks and Multidimensional Upscaling. international conference on learning representations (ICLR)","author":"Menick Jacob","year":"2018","unstructured":"Jacob Menick and Nal Kalchbrenner . 2018 . Generating High Fidelity Images with Subscale Pixel Networks and Multidimensional Upscaling. international conference on learning representations (ICLR) (2018). Jacob Menick and Nal Kalchbrenner. 2018. Generating High Fidelity Images with Subscale Pixel Networks and Multidimensional Upscaling. international conference on learning representations (ICLR) (2018)."},{"key":"e_1_3_2_2_47_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol , Prafulla Dhariwal , Aditya Ramesh , Pranav Shyam , Pamela Mishkin , Bob McGrew , Ilya Sutskever , and Mark Chen . 2021 . Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021). Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"e_1_3_2_2_49_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford , Jong Wook Kim , Chris Hallacy , Aditya Ramesh , Gabriel Goh , Sandhini Agarwal , Girish Sastry , Amanda Askell , Pamela Mishkin , Jack Clark , 2021 . Learning transferable visual models from natural language supervision . In International Conference on Machine Learning. PMLR, 8748--8763 . Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_50_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh , Prafulla Dhariwal , Alex Nichol , Casey Chu , and Mark Chen . 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 ( 2022 ). Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_2_51_1","volume-title":"Zero-Shot Text-to-Image Generation. In International Conference on Machine Learning (ICML). 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh , Mikhail Pavlov , Gabriel Goh , Scott Gray , Chelsea Voss , Alec Radford , Mark Chen , and Ilya Sutskever . 2021 . Zero-Shot Text-to-Image Generation. In International Conference on Machine Learning (ICML). 8821--8831 . Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. In International Conference on Machine Learning (ICML). 8821--8831."},{"key":"e_1_3_2_2_52_1","unstructured":"Ali Razavi A\u00e4ron van den Oord and Oriol Vinyals. 2019. Generating Diverse High-Fidelity Images with VQ-VAE-2. In Advances in Neural Information Processing Systems.  Ali Razavi A\u00e4ron van den Oord and Oriol Vinyals. 2019. Generating Diverse High-Fidelity Images with VQ-VAE-2. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_2_53_1","unstructured":"Nerdy Rodent. 2022. Source Code of VQGAN-CLIP. https:\/\/github.com\/nerdyrodent\/VQGAN-CLIP  Nerdy Rodent. 2022. Source Code of VQGAN-CLIP. https:\/\/github.com\/nerdyrodent\/VQGAN-CLIP"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models.  Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"crossref","unstructured":"Olaf Ronneberger Philipp Fischer and Thomas Brox. 2015. U-Net: Convolutional Networks for Biomedical Image Segmentation. In Medical Image Computing and Computer-Assisted Intervention.  Olaf Ronneberger Philipp Fischer and Thomas Brox. 2015. U-Net: Convolutional Networks for Biomedical Image Segmentation. In Medical Image Computing and Computer-Assisted Intervention.","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_2_56_1","volume-title":"DAE-GAN: Dynamic Aspect-aware GAN for Text-to-Image Synthesis. In IEEE\/CVF International Conference on Computer Vision (ICCV). 13960--13969","author":"Ruan Shulan","year":"2021","unstructured":"Shulan Ruan , Yong Zhang , Kun Zhang , Yanbo Fan , Fan Tang , Qi Liu , and Enhong Chen . 2021 . DAE-GAN: Dynamic Aspect-aware GAN for Text-to-Image Synthesis. In IEEE\/CVF International Conference on Computer Vision (ICCV). 13960--13969 . Shulan Ruan, Yong Zhang, Kun Zhang, Yanbo Fan, Fan Tang, Qi Liu, and Enhong Chen. 2021. DAE-GAN: Dynamic Aspect-aware GAN for Text-to-Image Synthesis. In IEEE\/CVF International Conference on Computer Vision (ICCV). 13960--13969."},{"key":"e_1_3_2_2_57_1","volume-title":"StyleCLIPDraw: Coupling Content and Style in Text-to-Drawing Translation. arXiv preprint arXiv:2202.12362","author":"Schaldenbrand Peter","year":"2022","unstructured":"Peter Schaldenbrand , Zhixuan Liu , and Jean Oh. 2022. StyleCLIPDraw: Coupling Content and Style in Text-to-Drawing Translation. arXiv preprint arXiv:2202.12362 ( 2022 ). Peter Schaldenbrand, Zhixuan Liu, and Jean Oh. 2022. StyleCLIPDraw: Coupling Content and Style in Text-to-Drawing Translation. arXiv preprint arXiv:2202.12362 (2022)."},{"key":"e_1_3_2_2_58_1","volume-title":"Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv: Learning","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein , Eric L. Weiss , Niru Maheswaranathan , and Surya Ganguli . 2015. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv: Learning ( 2015 ). Jascha Sohl-Dickstein, Eric L. Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv: Learning (2015)."},{"key":"e_1_3_2_2_59_1","volume-title":"Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv: Learning","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein , Eric L. Weiss , Niru Maheswaranathan , and Surya Ganguli . 2015. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv: Learning ( 2015 ). Jascha Sohl-Dickstein, Eric L. Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv: Learning (2015)."},{"key":"e_1_3_2_2_60_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ICLR).","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song , Chenlin Meng , and Stefano Ermon . 2021 . Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ICLR). Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_61_1","volume-title":"Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon . 2019. Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32 ( 2019 ). Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_2_62_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Song Yang","year":"2021","unstructured":"Yang Song , Jascha Sohl-Dickstein , Diederik P. Kingma , Abhishek Kumar , Stefano Ermon , and Ben Poole . 2021 . Score-Based Generative Modeling through Stochastic Differential Equations . In International Conference on Learning Representations (ICLR). Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2021. Score-Based Generative Modeling through Stochastic Differential Equations. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_63_1","volume-title":"Hern\u00e1n Aguirre, and Kiyoshi Tanaka.","author":"Tan Wei Ren","year":"2017","unstructured":"Wei Ren Tan , Chee Seng Chan , Hern\u00e1n Aguirre, and Kiyoshi Tanaka. 2017 . ArtGAN: Artwork Synthesis with Conditional Categorial GANs . Wei Ren Tan, Chee Seng Chan, Hern\u00e1n Aguirre, and Kiyoshi Tanaka. 2017. ArtGAN: Artwork Synthesis with Conditional Categorial GANs."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2866698"},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2017.2774292"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475226"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2014.2303984"},{"key":"e_1_3_2_2_68_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7786--7795","author":"Zhao Lei","year":"2020","unstructured":"ZhizhongWang, Lei Zhao , Haibo Chen , Lihong Qiu , Qihang Mo , Sihuan Lin , Wei Xing , and Dongming Lu . 2020 . Diversified Arbitrary Style Transfer via Deep Feature Perturbation . In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7786--7795 . ZhizhongWang, Lei Zhao, Haibo Chen, Lihong Qiu, Qihang Mo, Sihuan Lin,Wei Xing, and Dongming Lu. 2020. Diversified Arbitrary Style Transfer via Deep Feature Perturbation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7786--7795."},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-022-2140-7"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0234-8"},{"key":"e_1_3_2_2_72_1","volume-title":"Exploring Painting Synthesis with Diffusion Models. In IEEE 1st International Conference on Digital Twins and Parallel Intelligence (DTPI). 332--335","author":"Yi Da","year":"2021","unstructured":"Da Yi , Chao Guo , and Tianxiang Bai . 2021 . Exploring Painting Synthesis with Diffusion Models. In IEEE 1st International Conference on Digital Twins and Parallel Intelligence (DTPI). 332--335 . Da Yi, Chao Guo, and Tianxiang Bai. 2021. Exploring Painting Synthesis with Diffusion Models. In IEEE 1st International Conference on Digital Twins and Parallel Intelligence (DTPI). 332--335."},{"key":"e_1_3_2_2_73_1","volume-title":"Wide residual networks. arXiv preprint arXiv:1605.07146","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis . 2016. Wide residual networks. arXiv preprint arXiv:1605.07146 ( 2016 ). Sergey Zagoruyko and Nikos Komodakis. 2016. Wide residual networks. arXiv preprint arXiv:1605.07146 (2016)."},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1162\/LEON_a_00908"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530736"},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240536"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Lisboa Portugal","acronym":"MM '22"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548282","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548282","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:42Z","timestamp":1750186842000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548282"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":78,"alternative-id":["10.1145\/3503161.3548282","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548282","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}