{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:07:19Z","timestamp":1775066839821,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,10]],"date-time":"2023-12-10T00:00:00Z","timestamp":1702166400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,10]]},"DOI":"10.1145\/3610548.3618154","type":"proceedings-article","created":{"date-parts":[[2023,12,11]],"date-time":"2023-12-11T12:28:40Z","timestamp":1702297720000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":118,"title":["Break-A-Scene: Extracting Multiple Concepts from a Single Image"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7628-7525","authenticated-orcid":false,"given":"Omri","family":"Avrahami","sequence":"first","affiliation":[{"name":"The Hebrew University of Jerusalem, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4958-601X","authenticated-orcid":false,"given":"Kfir","family":"Aberman","sequence":"additional","affiliation":[{"name":"Google Research, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7109-4006","authenticated-orcid":false,"given":"Ohad","family":"Fried","sequence":"additional","affiliation":[{"name":"Reichman University, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6777-7445","authenticated-orcid":false,"given":"Daniel","family":"Cohen-Or","sequence":"additional","affiliation":[{"name":"Tel Aviv University, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6191-0361","authenticated-orcid":false,"given":"Dani","family":"Lischinski","sequence":"additional","affiliation":[{"name":"The Hebrew University of Jerusalem, Israel"}]}],"member":"320","published-online":{"date-parts":[[2023,12,11]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00453"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00832"},{"key":"e_1_3_2_2_3_1","volume-title":"HyperStyle: StyleGAN Inversion with HyperNetworks for Real Image Editing. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","author":"Alaluf Yuval","year":"2021","unstructured":"Yuval Alaluf, Omer Tov, Ron Mokady, Rinon Gal, and Amit\u00a0Haim Bermano. 2021. HyperStyle: StyleGAN Inversion with HyperNetworks for Real Image Editing. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 18490\u201318500. https:\/\/api.semanticscholar.org\/CorpusID:244729249"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592450"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01762"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"e_1_3_2_2_8_1","volume-title":"Multidiffusion: Fusing diffusion paths for controlled image generation.","author":"Bar-Tal Omer","year":"2023","unstructured":"Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel. 2023. Multidiffusion: Fusing diffusion paths for controlled image generation. (2023)."},{"key":"e_1_3_2_2_9_1","unstructured":"David Bau Alex Andonian Audrey Cui YeonHwan Park Ali Jahanian Aude Oliva and Antonio Torralba. 2021. Paint by Word. arxiv:2103.10951\u00a0[cs.CV]"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323023"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Tim Brooks Aleksander Holynski and Alexei\u00a0A. Efros. 2023. InstructPix2Pix: Learning to Follow Image Editing Instructions. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_2_12_1","volume-title":"Emerging Properties in Self-Supervised Vision Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Caron Mathilde","year":"2021","unstructured":"Mathilde Caron, Hugo Touvron, Ishan Misra, Herv\u2019e J\u2019egou, Julien Mairal, Piotr Bojanowski, and Armand Joulin. 2021. Emerging Properties in Self-Supervised Vision Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 9630\u20139640."},{"key":"e_1_3_2_2_13_1","volume-title":"International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:255372955","author":"Chang Huiwen","year":"2023","unstructured":"Huiwen Chang, Han Zhang, Jarred Barber, AJ Maschinot, Jos\u00e9 Lezama, Lu Jiang, Ming Yang, Kevin\u00a0P. Murphy, William\u00a0T. Freeman, Michael Rubinstein, Yuanzhen Li, and Dilip Krishnan. 2023. Muse: Text-To-Image Generation via Masked Generative Transformers. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:255372955"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_2_15_1","volume-title":"Transformer Interpretability Beyond Attention Visualization. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Chefer Hila","year":"2020","unstructured":"Hila Chefer, Shir Gur, and Lior Wolf. 2020. Transformer Interpretability Beyond Attention Visualization. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 782\u2013791."},{"key":"e_1_3_2_2_16_1","volume-title":"Generic Attention-model Explainability for Interpreting Bi-Modal and Encoder-Decoder Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Chefer Hila","year":"2021","unstructured":"Hila Chefer, Shir Gur, and Lior Wolf. 2021. Generic Attention-model Explainability for Interpreting Bi-Modal and Encoder-Decoder Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 387\u2013396."},{"key":"e_1_3_2_2_17_1","volume-title":"Subject-driven Text-to-Image Generation via Apprenticeship Learning. ArXiv abs\/2304.00186","author":"Chen Wenhu","year":"2023","unstructured":"Wenhu Chen, Hexiang Hu, Yandong Li, Nataniel Rui, Xuhui Jia, Ming-Wei Chang, and William\u00a0W. Cohen. 2023. Subject-driven Text-to-Image Generation via Apprenticeship Learning. ArXiv abs\/2304.00186 (2023)."},{"key":"e_1_3_2_2_18_1","unstructured":"Bowen Cheng Alexander\u00a0G. Schwing and Alexander Kirillov. 2021. Per-Pixel Classification is Not All You Need for Semantic Segmentation. In Neural Information Processing Systems."},{"key":"e_1_3_2_2_19_1","volume-title":"European Conference on Computer Vision. Springer, 558\u2013577","author":"Cohen Niv","year":"2022","unstructured":"Niv Cohen, Rinon Gal, Eli\u00a0A Meirom, Gal Chechik, and Yuval Atzmon. 2022. \u201cThis is my unicorn, Fluffy\u201d: Personalizing frozen vision-language representations. In European Conference on Computer Vision. Springer, 558\u2013577."},{"key":"e_1_3_2_2_20_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Couairon Guillaume","year":"2022","unstructured":"Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord. 2022. DiffEdit: Diffusion-based semantic image editing with mask guidance. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_6"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"e_1_3_2_2_23_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Gal Rinon","year":"2022","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit\u00a0Haim Bermano, Gal Chechik, and Daniel Cohen-or. 2022. An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592133"},{"key":"e_1_3_2_2_25_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems 27","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_2_26_1","volume-title":"SVDiff: Compact Parameter Space for Diffusion Fine-Tuning. ArXiv abs\/2303.11305","author":"Han Ligong","year":"2023","unstructured":"Ligong Han, Yinxiao Li, Han Zhang, Peyman Milanfar, Dimitris\u00a0N. Metaxas, and Feng Yang. 2023. SVDiff: Compact Parameter Space for Diffusion Fine-Tuning. ArXiv abs\/2303.11305 (2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"Prompt-to-Prompt Image Editing with Cross-Attention Control. In The Eleventh International Conference on Learning Representations.","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-or. 2022. Prompt-to-Prompt Image Editing with Cross-Attention Control. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_28_1","volume-title":"Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv abs\/2210.02303","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey\u00a0A. Gritsenko, Diederik\u00a0P. Kingma, Ben Poole, Mohammad Norouzi, David\u00a0J. Fleet, and Tim Salimans. 2022. Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv abs\/2210.02303 (2022)."},{"key":"e_1_3_2_2_29_1","volume-title":"Proc.\u00a0NeurIPS.","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Proc.\u00a0NeurIPS."},{"key":"e_1_3_2_2_30_1","volume-title":"Conffusion: Confidence Intervals for Diffusion Models. ArXiv abs\/2211.09795","author":"Horwitz Eliahu","year":"2022","unstructured":"Eliahu Horwitz and Yedid Hoshen. 2022. Conffusion: Confidence Intervals for Diffusion Models. ArXiv abs\/2211.09795 (2022)."},{"key":"e_1_3_2_2_31_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu J","year":"2021","unstructured":"Edward\u00a0J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592123"},{"key":"e_1_3_2_2_33_1","volume-title":"Taming Encoder for Zero Fine-tuning Image Customization with Text-to-Image Diffusion Models. ArXiv abs\/2304.02642","author":"Jia Xuhui","year":"2023","unstructured":"Xuhui Jia, Yang Zhao, Kelvin C.\u00a0K. Chan, Yandong Li, Han-Ying Zhang, Boqing Gong, Tingbo Hou, H. Wang, and Yu-Chuan Su. 2023. Taming Encoder for Zero Fine-tuning Image Customization with Text-to-Image Diffusion Models. ArXiv abs\/2304.02642 (2023)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander\u00a0C. Berg Wan-Yen Lo Piotr Doll\u00e1r and Ross Girshick. 2023. Segment Anything. arxiv:2304.02643\u00a0[cs.CV]","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01753"},{"key":"e_1_3_2_2_40_1","volume-title":"Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision.","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge\u00a0J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C.\u00a0Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision."},{"key":"e_1_3_2_2_41_1","volume-title":"Generating Images from Captions with Attention. CoRR abs\/1511.02793","author":"Mansimov Elman","year":"2016","unstructured":"Elman Mansimov, Emilio Parisotto, Jimmy Ba, and Ruslan Salakhutdinov. 2016. Generating Images from Captions with Attention. CoRR abs\/1511.02793 (2016)."},{"key":"e_1_3_2_2_42_1","volume-title":"SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations. In International Conference on Learning Representations.","author":"Meng Chenlin","year":"2021","unstructured":"Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2021. SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_2_44_1","volume-title":"Dreamix: Video Diffusion Models are General Video Editors","author":"Molad Eyal","year":"2023","unstructured":"Eyal Molad, Eliahu Horwitz, Dani Valevski, Alex\u00a0Rav Acha, Y. Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen. 2023. Dreamix: Video Diffusion Models are General Video Editors. ArXiv abs\/2302.01329 (2023)."},{"key":"e_1_3_2_2_45_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:245335086","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:245335086"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555436"},{"key":"e_1_3_2_2_47_1","volume-title":"Localizing Object-level Shape Variations with Text-to-Image Diffusion Models. ArXiv abs\/2303.11306","author":"Patashnik Or","year":"2023","unstructured":"Or Patashnik, Daniel Garibi, Idan Azuri, Hadar Averbuch-Elor, and Daniel Cohen-Or. 2023. Localizing Object-level Shape Variations with Text-to-Image Diffusion Models. ArXiv abs\/2303.11306 (2023)."},{"key":"e_1_3_2_2_48_1","volume-title":"StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","author":"Patashnik Or","year":"2021","unstructured":"Or Patashnik, Zongze Wu, Eli Shechtman, Daniel Cohen-Or, and Dani Lischinski. 2021. StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 2065\u20132074. https:\/\/api.semanticscholar.org\/CorpusID:232428282"},{"key":"e_1_3_2_2_49_1","volume-title":"Adversarial Latent Autoencoders. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Pidhorskyi Stanislav","year":"2020","unstructured":"Stanislav Pidhorskyi, Donald\u00a0A. Adjeroh, and Gianfranco Doretto. 2020. Adversarial Latent Autoencoders. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 14092\u201314101."},{"key":"e_1_3_2_2_50_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_2_51_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_2_52_1","volume-title":"International Conference on Machine Learning. PMLR, 8821\u20138831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821\u20138831."},{"key":"e_1_3_2_2_53_1","volume-title":"Proc.\u00a0ICLR. 1060\u20131069","author":"Reed Scott","year":"2016","unstructured":"Scott Reed, Zeynep Akata, Xinchen Yan, Lajanugen Logeswaran, Bernt Schiele, and Honglak Lee. 2016. Generative adversarial text to image synthesis. In Proc.\u00a0ICLR. 1060\u20131069."},{"key":"e_1_3_2_2_54_1","volume-title":"2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Richardson Elad","year":"2020","unstructured":"Elad Richardson, Yuval Alaluf, Or Patashnik, Yotam Nitzan, Yaniv Azar, Stav Shapiro, and Daniel Cohen-Or. 2020. Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 2287\u20132296."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591503"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544777"},{"key":"e_1_3_2_2_57_1","volume-title":"High-Resolution Image Synthesis with Latent Diffusion Models. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Rombach Robin","year":"2021","unstructured":"Robin Rombach, A. Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 10674\u201310685."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_2_59_1","unstructured":"Simo Ryu. 2022. Low-rank Adaptation for Fast Text-to-Image Diffusion Fine-tuning. https:\/\/github.com\/cloneofsimo\/lora."},{"key":"e_1_3_2_2_60_1","volume-title":"Palette: Image-to-Image Diffusion Models. ACM SIGGRAPH 2022 Conference Proceedings","author":"Saharia Chitwan","year":"2021","unstructured":"Chitwan Saharia, William Chan, Huiwen Chang, Chris\u00a0A. Lee, Jonathan Ho, Tim Salimans, David\u00a0J. Fleet, and Mohammad Norouzi. 2021a. Palette: Image-to-Image Diffusion Models. ACM SIGGRAPH 2022 Conference Proceedings (2021)."},{"key":"e_1_3_2_2_61_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily\u00a0L Denton, Kamyar Ghasemipour, Raphael Gontijo\u00a0Lopes, Burcu Karagol\u00a0Ayan, Tim Salimans, 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems 35 (2022), 36479\u201336494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_62_1","first-page":"4713","article-title":"Image Super-Resolution via Iterative Refinement","volume":"45","author":"Saharia Chitwan","year":"2021","unstructured":"Chitwan Saharia, Jonathan Ho, William Chan, Tim Salimans, David\u00a0J. Fleet, and Mohammad Norouzi. 2021b. Image Super-Resolution via Iterative Refinement. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 (2021), 4713\u20134726.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_2_63_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Sheynin Shelly","year":"2022","unstructured":"Shelly Sheynin, Oron Ashual, Adam Polyak, Uriel Singer, Oran Gafni, Eliya Nachmani, and Yaniv Taigman. 2022. kNN-Diffusion: Image Generation via Large-Scale Retrieval. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_64_1","volume-title":"InstantBooth: Personalized Text-to-Image Generation without Test-Time Finetuning. ArXiv abs\/2304.03411","author":"Shi Jing","year":"2023","unstructured":"Jing Shi, Wei Xiong, Zhe\u00a0L. Lin, and Hyun\u00a0Joon Jung. 2023. InstantBooth: Personalized Text-to-Image Generation without Test-Time Finetuning. ArXiv abs\/2304.03411 (2023)."},{"key":"e_1_3_2_2_65_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, 2022. Make-A-Video: Text-to-Video Generation without Text-Video Data. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_66_1","volume-title":"International Conference on Machine Learning. PMLR, 2256\u20132265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning. PMLR, 2256\u20132265."},{"key":"e_1_3_2_2_67_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_68_1","volume-title":"Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_2_69_1","volume-title":"ObjectStitch: Generative Object Compositing. ArXiv abs\/2212.00932","author":"Song Yi-Zhe","year":"2022","unstructured":"Yi-Zhe Song, Zhifei Zhang, Zhe\u00a0L. Lin, Scott\u00a0D. Cohen, Brian\u00a0L. Price, Jianming Zhang, Soo\u00a0Ye Kim, and Daniel\u00a0G. Aliaga. 2022. ObjectStitch: Generative Object Compositing. ArXiv abs\/2212.00932 (2022)."},{"key":"e_1_3_2_2_70_1","volume-title":"Key-Locked Rank One Editing for Text-to-Image Personalization. ACM SIGGRAPH 2023 Conference Proceedings","author":"Tewel Yoad","year":"2023","unstructured":"Yoad Tewel, Rinon Gal, Gal Chechik, and Yuval Atzmon. 2023. Key-Locked Rank One Editing for Text-to-Image Personalization. ACM SIGGRAPH 2023 Conference Proceedings (2023). https:\/\/api.semanticscholar.org\/CorpusID:258436985"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459838"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_2_2_73_1","volume-title":"UniTune: Text-Driven Image Editing by Fine Tuning an Image Generation Model on a Single Image. arXiv preprint arXiv:2210.09477","author":"Valevski Dani","year":"2022","unstructured":"Dani Valevski, Matan Kalman, Yossi Matias, and Yaniv Leviathan. 2022. UniTune: Text-Driven Image Editing by Fine Tuning an Image Generation Model on a Single Image. arXiv preprint arXiv:2210.09477 (2022)."},{"key":"e_1_3_2_2_74_1","volume-title":"Extended Textual Conditioning in Text-to-Image Generation. ArXiv abs\/2303.09522","author":"Voynov Andrey","year":"2023","unstructured":"Andrey Voynov, Q. Chu, Daniel Cohen-Or, and Kfir Aberman. 2023. P+: Extended Textual Conditioning in Text-to-Image Generation. ArXiv abs\/2303.09522 (2023)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"e_1_3_2_2_76_1","volume-title":"ELITE: Encoding Visual Concepts into Textual Embeddings for Customized Text-to-Image Generation. ArXiv abs\/2302.13848","author":"Wei Yuxiang","year":"2023","unstructured":"Yuxiang Wei, Yabo Zhang, Zhilong Ji, Jinfeng Bai, Lei Zhang, and Wangmeng Zuo. 2023. ELITE: Encoding Visual Concepts into Textual Embeddings for Customized Text-to-Image Generation. ArXiv abs\/2302.13848 (2023)."},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3181070"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_2_2_80_1","volume-title":"Scaling Autoregressive Models for Content-Rich Text-to-Image Generation. arXiv preprint arXiv:2206.10789","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing\u00a0Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu\u00a0Karagol Ayan, 2022. Scaling Autoregressive Models for Content-Rich Text-to-Image Generation. arXiv preprint arXiv:2206.10789 (2022)."},{"key":"e_1_3_2_2_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"e_1_3_2_2_82_1","volume-title":"Realistic image synthesis with stacked generative adversarial networks","author":"Zhang Han","year":"2018","unstructured":"Han Zhang, Tao Xu, Hongsheng Li, Shaoting Zhang, Xiaogang Wang, Xiaolei Huang, and Dimitris\u00a0N Metaxas. 2018. StackGAN++: Realistic image synthesis with stacked generative adversarial networks. IEEE transactions on pattern analysis and machine intelligence 41, 8 (2018), 1947\u20131962."},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_35"},{"key":"e_1_3_2_2_84_1","volume-title":"Improved StyleGAN Embedding: Where are the Good Latents?ArXiv abs\/2012.09036","author":"Zhu Peihao","year":"2020","unstructured":"Peihao Zhu, Rameen Abdal, Yipeng Qin, and Peter Wonka. 2020a. Improved StyleGAN Embedding: Where are the Good Latents?ArXiv abs\/2012.09036 (2020)."}],"event":{"name":"SA '23: SIGGRAPH Asia 2023","location":"Sydney NSW Australia","acronym":"SA '23","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2023 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3610548.3618154","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3610548.3618154","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T09:33:25Z","timestamp":1755768805000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3610548.3618154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,10]]},"references-count":84,"alternative-id":["10.1145\/3610548.3618154","10.1145\/3610548"],"URL":"https:\/\/doi.org\/10.1145\/3610548.3618154","relation":{},"subject":[],"published":{"date-parts":[[2023,12,10]]},"assertion":[{"value":"2023-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}