{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T22:08:59Z","timestamp":1776290939249,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":367,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100020959","name":"JST-Mirai Program","doi-asserted-by":"publisher","award":["JPMJMI21H1"],"award-info":[{"award-number":["JPMJMI21H1"]}],"id":[{"id":"10.13039\/501100020959","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612191","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5321-5329","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":32,"title":["Guided Image Synthesis via Initial Image Editing in Diffusion Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-0907-7522","authenticated-orcid":false,"given":"Jiafeng","family":"Mao","sequence":"first","affiliation":[{"name":"The University of Tokyo, Bunkyo-ku, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1009-1095","authenticated-orcid":false,"given":"Xueting","family":"Wang","sequence":"additional","affiliation":[{"name":"CyberAgent, Inc., Shibuya-ku, Tokyo , Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2146-6275","authenticated-orcid":false,"given":"Kiyoharu","family":"Aizawa","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Bunkyo-ku, Japan"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"SpaText: Spatio-Textual Representation for Controllable Image Generation. arXiv preprint arXiv:2211.14305","author":"Avrahami Omri","year":"2022","unstructured":"Omri Avrahami, Thomas Hayes, Oran Gafni, Sonal Gupta, Yaniv Taigman, Devi Parikh, Dani Lischinski, Ohad Fried, and Xi Yin. 2022. SpaText: Spatio-Textual Representation for Controllable Image Generation. arXiv preprint arXiv:2211.14305 (2022)."},{"key":"e_1_3_2_2_2_1","unstructured":"Yogesh Balaji Seungjun Nah Xun Huang Arash Vahdat Jiaming Song Karsten Kreis Miika Aittala Timo Aila Samuli Laine Bryan Catanzaro et al. 2022. ediffi: Text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)."},{"key":"e_1_3_2_2_3_1","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. NeurIPS, Vol. 34 (2021), 8780--8794.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_4_1","first-page":"19822","article-title":"Cogview: Mastering text-to-image generation via transformers","volume":"34","author":"Ding Ming","year":"2021","unstructured":"Ming Ding, Zhuoyi Yang, Wenyi Hong, Wendi Zheng, Chang Zhou, Da Yin, Junyang Lin, Xu Zou, Zhou Shao, Hongxia Yang, et al. 2021. Cogview: Mastering text-to-image generation via transformers. NeurIPS, Vol. 34 (2021), 19822--19835.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_5_1","volume-title":"Xin Eric Wang, and William Yang Wang","author":"Feng Weixi","year":"2022","unstructured":"Weixi Feng, Xuehai He, Tsu-Jui Fu, Varun Jampani, Arjun Akula, Pradyumna Narayana, Sugato Basu, Xin Eric Wang, and William Yang Wang. 2022. Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis. arXiv preprint arXiv:2212.05032 (2022)."},{"key":"e_1_3_2_2_6_1","volume-title":"Make-a-scene: Scene-based text-to-image generation with human priors","author":"Gafni Oran","year":"2022","unstructured":"Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman. 2022. Make-a-scene: Scene-based text-to-image generation with human priors. In ECCV. Springer, 89--106."},{"key":"e_1_3_2_2_7_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_2_8_1","volume-title":"NeurIPS","volume":"30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_2_9_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. NeurIPS, Vol. 33 (2020), 6840--6851.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_10_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications.","author":"Ho Jonathan","unstructured":"Jonathan Ho and Tim Salimans. [n.,d.]. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_2_2_11_1","volume-title":"Composer: Creative and controllable image synthesis with composable conditions. arXiv preprint arXiv:2302.09778","author":"Huang Lianghua","year":"2023","unstructured":"Lianghua Huang, Di Chen, Yu Liu, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. Composer: Creative and controllable image synthesis with composable conditions. arXiv preprint arXiv:2302.09778 (2023)."},{"key":"e_1_3_2_2_12_1","volume-title":"Imagic: Text-based real image editing with diffusion models. arXiv preprint arXiv:2210.09276","author":"Kawar Bahjat","year":"2022","unstructured":"Bahjat Kawar, Shiran Zada, Oran Lang, Omer Tov, Huiwen Chang, Tali Dekel, Inbar Mosseri, and Michal Irani. 2022. Imagic: Text-based real image editing with diffusion models. arXiv preprint arXiv:2210.09276 (2022)."},{"key":"e_1_3_2_2_13_1","volume-title":"Diffusionclip: Text-guided diffusion models for robust image manipulation. In CVPR. 2426--2435.","author":"Kim Gwanghyun","year":"2022","unstructured":"Gwanghyun Kim, Taesung Kwon, and Jong Chul Ye. 2022. Diffusionclip: Text-guided diffusion models for robust image manipulation. In CVPR. 2426--2435."},{"key":"e_1_3_2_2_14_1","volume-title":"Microsoft coco: Common objects in context","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. Springer, 740--755."},{"key":"e_1_3_2_2_15_1","unstructured":"Luping Liu Yi Ren Zhijie Lin and Zhou Zhao. 2022b. Pseudo Numerical Methods for Diffusion Models on Manifolds. In ICLR."},{"key":"e_1_3_2_2_16_1","volume-title":"Compositional visual generation with composable diffusion models","author":"Liu Nan","unstructured":"Nan Liu, Shuang Li, Yilun Du, Antonio Torralba, and Joshua B Tenenbaum. 2022a. Compositional visual generation with composable diffusion models. In ECCV. Springer, 423--439."},{"key":"e_1_3_2_2_17_1","volume-title":"Training-Free Location-Aware Text-to-Image Synthesis. arXiv preprint arXiv:2304.13427","author":"Mao Jiafeng","year":"2023","unstructured":"Jiafeng Mao and Xueting Wang. 2023. Training-Free Location-Aware Text-to-Image Synthesis. arXiv preprint arXiv:2304.13427 (2023)."},{"key":"e_1_3_2_2_18_1","volume-title":"T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models. arXiv e-prints","author":"Mou Chong","year":"2023","unstructured":"Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, and Xiaohu Qie. 2023. T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models. arXiv e-prints (2023), arXiv--2302."},{"key":"e_1_3_2_2_19_1","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In ICML. PMLR 8162--8171."},{"key":"e_1_3_2_2_20_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In ICML. PMLR, 16784--16804.","author":"Nichol Alexander Quinn","year":"2022","unstructured":"Alexander Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In ICML. PMLR, 16784--16804."},{"key":"e_1_3_2_2_21_1","series-title":"Round 1","volume-title":"NeurIPS Datasets and Benchmarks Track","author":"Park Dong Huk","unstructured":"Dong Huk Park, Samaneh Azadi, Xihui Liu, Trevor Darrell, and Anna Rohrbach. 2021. Benchmark for compositional text-to-image synthesis. In NeurIPS Datasets and Benchmarks Track (Round 1)."},{"key":"e_1_3_2_2_22_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748--8763."},{"key":"e_1_3_2_2_23_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_2_24_1","unstructured":"Aditya Ramesh Mikhail Pavlov Gabriel Goh Scott Gray Chelsea Voss Alec Radford Mark Chen and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In ICML. PMLR 8821--8831."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR. 10684--10695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_26_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In MICCAI. Springer, 234--241."},{"key":"e_1_3_2_2_27_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In ICLR."},{"key":"e_1_3_2_2_28_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_2_29_1","volume-title":"Sketch-Guided Text-to-Image Diffusion Models. arXiv preprint arXiv:2211.13752","author":"Voynov Andrey","year":"2022","unstructured":"Andrey Voynov, Kfir Aberman, and Daniel Cohen-Or. 2022. Sketch-Guided Text-to-Image Diffusion Models. arXiv preprint arXiv:2211.13752 (2022)."},{"key":"e_1_3_2_2_30_1","volume-title":"You only learn one representation: Unified network for multiple tasks. arXiv preprint arXiv:2105.04206","author":"Wang Chien-Yao","year":"2021","unstructured":"Chien-Yao Wang, I-Hau Yeh, and Hong-Yuan Mark Liao. 2021. You only learn one representation: Unified network for multiple tasks. arXiv preprint arXiv:2105.04206 (2021)."},{"key":"e_1_3_2_2_31_1","volume-title":"Pretraining is all you need for image-to-image translation. arXiv preprint arXiv:2205.12952","author":"Wang Tengfei","year":"2022","unstructured":"Tengfei Wang, Ting Zhang, Bo Zhang, Hao Ouyang, Dong Chen, Qifeng Chen, and Fang Wen. 2022. Pretraining is all you need for image-to-image translation. arXiv preprint arXiv:2205.12952 (2022)."},{"key":"e_1_3_2_2_32_1","volume-title":"Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543","author":"Zhang Lvmin","year":"2023","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2023. Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612191","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612191","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612191","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:02:22Z","timestamp":1755820942000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612191"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":32,"alternative-id":["10.1145\/3581783.3612191","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612191","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}