{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:49:18Z","timestamp":1777567758272,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680621","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"10901-10909","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Customizing Text-to-Image Generation with Inverted Interaction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1301-1323","authenticated-orcid":false,"given":"Mengmeng","family":"Ge","sequence":"first","affiliation":[{"name":"Advanced Micro Devices, Inc. 
&amp; Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3168-3505","authenticated-orcid":false,"given":"Xu","family":"Jia","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5170-5436","authenticated-orcid":false,"given":"Takashi","family":"Isobe","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7202-6865","authenticated-orcid":false,"given":"Xiaomin","family":"Li","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6908-5485","authenticated-orcid":false,"given":"Qinghe","family":"Wang","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4812-1111","authenticated-orcid":false,"given":"Jing","family":"Mu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7155-8484","authenticated-orcid":false,"given":"Dong","family":"Zhou","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3347-1518","authenticated-orcid":false,"given":"Li","family":"Wang","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6668-9758","authenticated-orcid":false,"given":"Huchuan","family":"Lu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6734-1984","authenticated-orcid":false,"given":"Lu","family":"Tian","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7828-0871","authenticated-orcid":false,"given":"Ashish","family":"Sirasao","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4097-8690","authenticated-orcid":false,"given":"Emad","family":"Barsoum","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., San Jose, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"An Image is Worth Multiple Words: Multi-attribute Inversion for Constrained Text-to-Image Synthesis. CoRR","author":"Agarwal Aishwarya","year":"2023","unstructured":"Aishwarya Agarwal, Srikrishna Karanam, Tripti Shukla, and Balaji Vasan Srinivasan. 2023. An Image is Worth Multiple Words: Multi-attribute Inversion for Constrained Text-to-Image Synthesis. CoRR, Vol. abs\/2311.11919 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Dreamstyler: Paint by style inversion with text-to-image diffusion models. In AAAI.","author":"Ahn Namhyuk","year":"2024","unstructured":"Namhyuk Ahn, Junsoo Lee, Chunggi Lee, Kunhee Kim, Daesik Kim, Seung-Hun Nam, and Kibeom Hong. 2024. Dreamstyler: Paint by style inversion with text-to-image diffusion models. In AAAI."},{"key":"e_1_3_2_1_3_1","volume-title":"Break-A-Scene: Extracting Multiple Concepts from a Single Image. CoRR","author":"Avrahami Omri","year":"2023","unstructured":"Omri Avrahami, Kfir Aberman, Ohad Fried, Daniel Cohen-Or, and Dani Lischinski. 2023. Break-A-Scene: Extracting Multiple Concepts from a Single Image. CoRR, Vol. abs\/2305.16311 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Tim Brooks Aleksander Holynski and Alexei A Efros. 2023. Instructpix2pix: Learning to follow image editing instructions. 
In CVPR.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_5_1","volume-title":"MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing. CoRR","author":"Cao Mingdeng","year":"2023","unstructured":"Mingdeng Cao, Xintao Wang, Zhongang Qi, Ying Shan, Xiaohu Qie, and Yinqiang Zheng. 2023. MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing. CoRR, Vol. abs\/2304.08465 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Zhe Cao Tomas Simon Shih-En Wei and Yaser Sheikh. 2017. Realtime multi-person 2d pose estimation using part affinity fields. In CVPR.","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_1_7_1","volume-title":"Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. TOG","author":"Chefer Hila","year":"2023","unstructured":"Hila Chefer, Yuval Alaluf, Yael Vinker, Lior Wolf, and Daniel Cohen-Or. 2023. Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. TOG (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Subject-driven text-to-image generation via apprenticeship learning. NIPS","author":"Chen Wenhu","year":"2024","unstructured":"Wenhu Chen, Hexiang Hu, Yandong Li, Nataniel Ruiz, Xuhui Jia, Ming-Wei Chang, and William W Cohen. 2024. Subject-driven text-to-image generation via apprenticeship learning. NIPS (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"A learned representation for artistic style. CoRR","author":"Dumoulin Vincent","year":"2016","unstructured":"Vincent Dumoulin, Jonathon Shlens, and Manjunath Kudlur. 2016. A learned representation for artistic style. CoRR, Vol. abs\/1610.07629 (2016)."},{"key":"e_1_3_2_1_10_1","unstructured":"Rinon Gal Yuval Alaluf Yuval Atzmon Or Patashnik Amit H Bermano Gal Chechik and Daniel Cohen-Or. 2023. An image is worth one word: Personalizing text-to-image generation using textual inversion. 
In ICLR."},{"key":"e_1_3_2_1_11_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. In ICLR."},{"key":"e_1_3_2_1_12_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In NIPS."},{"key":"e_1_3_2_1_13_1","volume-title":"Yap-Peng Tan, and Weipeng Hu.","author":"Hoe Jiun Tian","year":"2023","unstructured":"Jiun Tian Hoe, Xudong Jiang, Chee Seng Chan, Yap-Peng Tan, and Weipeng Hu. 2023. InteractDiffusion: Interaction Control in Text-to-Image Diffusion Models. CoRR, Vol. abs\/2312.05849 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"KV Inversion: KV Embeddings Learning for Text-Conditioned Real Image Action Editing. CoRR","author":"Huang Jiancheng","year":"2023","unstructured":"Jiancheng Huang, Yifan Liu, Jin Qin, and Shifeng Chen. 2023. KV Inversion: KV Embeddings Learning for Text-Conditioned Real Image Action Editing. CoRR, Vol. abs\/2309.16608 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Composer: Creative and Controllable Image Synthesis with Composable Conditions. CoRR","author":"Huang Lianghua","year":"2023","unstructured":"Lianghua Huang, Di Chen, Yu Liu, Shen Yujun, Deli Zhao, and Zhou Jingren. 2023. Composer: Creative and Controllable Image Synthesis with Composable Conditions. CoRR, Vol. abs\/2302.09778 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Siteng Huang Biao Gong Yutong Feng Xi Chen Yuqian Fu Yu Liu and Donglin Wang. 2024. Learning Disentangled Identifiers for Action-Customized Text-to-Image Generation. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00745"},{"key":"e_1_3_2_1_17_1","volume-title":"Kelvin CK Chan, and Ziwei Liu","author":"Huang Ziqi","year":"2023","unstructured":"Ziqi Huang, Tianxing Wu, Yuming Jiang, Kelvin CK Chan, and Ziwei Liu. 2023. ReVersion: Diffusion-Based Relation Inversion from Images. CoRR, Vol. 
abs\/2303.13495 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Semi-supervised classification with graph convolutional networks. CoRR","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. CoRR, Vol. abs\/1609.02907 (2016)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Nupur Kumari Bingliang Zhang Richard Zhang Eli Shechtman and Jun-Yan Zhu. 2023. Multi-concept customization of text-to-image diffusion. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_1_21_1","volume-title":"LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models. CoRR","author":"Lian Long","year":"2023","unstructured":"Long Lian, Boyi Li, Adam Yala, and Trevor Darrell. 2023. LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models. CoRR, Vol. abs\/2305.13655 (2023)."},{"key":"e_1_3_2_1_22_1","unstructured":"Zhiheng Liu Yifei Zhang Yujun Shen Kecheng Zheng Kai Zhu Ruili Feng Yu Liu Deli Zhao Jingren Zhou and Yang Cao. 2023. Customizable Image Synthesis with Multiple Subjects. In NIPS."},{"key":"e_1_3_2_1_23_1","unstructured":"Haoming Lu Hazarapet Tunanyan Kai Wang Shant Navasardyan Zhangyang Wang and Humphrey Shi. 2023. Specialist diffusion: Plug-and-play sample-efficient fine-tuning of text-to-image diffusion models to learn any unseen style. In CVPR."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Ron Mokady Amir Hertz Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2023. 
Null-text inversion for editing real images using guided diffusion models. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_25_1","unstructured":"OpenAI. 2023. DALL-E-2 https:\/\/openai.com\/product\/dall-e-2."},{"key":"e_1_3_2_1_26_1","volume-title":"Styleclip: Text-driven manipulation of stylegan imagery. In ICCV.","author":"Patashnik Or","year":"2021","unstructured":"Or Patashnik, Zongze Wu, Eli Shechtman, Daniel Cohen-Or, and Dani Lischinski. 2021. Styleclip: Text-driven manipulation of stylegan imagery. In ICCV."},{"key":"e_1_3_2_1_27_1","volume-title":"Grounded Text-to-Image Synthesis with Attention Refocusing. CoRR","author":"Phung Quynh","year":"2023","unstructured":"Quynh Phung, Songwei Ge, and Jia-Bin Huang. 2023. Grounded Text-to-Image Synthesis with Attention Refocusing. CoRR, Vol. abs\/2306.05427 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_29_1","volume-title":"Hierarchical text-conditional image generation with clip latents. CoRR","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. CoRR, Vol. abs\/2204.06125 (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. 
In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_31_1","volume-title":"Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation.","author":"Ruiz Nataniel","year":"2023","unstructured":"Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yael Pritch, Michael Rubinstein, and Kfir Aberman. 2023. Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation."},{"key":"e_1_3_2_1_32_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding."},{"key":"e_1_3_2_1_33_1","volume-title":"What the daam: Interpreting stable diffusion using cross attention. CoRR","author":"Tang Raphael","year":"2023","unstructured":"Raphael Tang, Linqing Liu, Akshat Pandey, Zhiying Jiang, Gefei Yang, Karun Kumar, Pontus Stenetorp, Jimmy Lin, and Ferhan Ture. 2023. What the daam: Interpreting stable diffusion using cross attention. CoRR, Vol. abs\/2210.04885 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Visualizing Data using t-SNE","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing Data using t-SNE. , Vol. 9, 86 (2008), 2579--2605."},{"key":"e_1_3_2_1_35_1","volume-title":"P+: Extended Textual Conditioning in Text-to-Image Generation. CoRR","author":"Voynov Andrey","year":"2023","unstructured":"Andrey Voynov, Qinghao Chu, Daniel Cohen-Or, and Kfir Aberman. 2023. P+: Extended Textual Conditioning in Text-to-Image Generation. CoRR, Vol. abs\/2303.09522 (2023)."},{"key":"e_1_3_2_1_36_1","unstructured":"Qiucheng Wu Yujian Liu Handong Zhao Ajinkya Kale Trung Bui Tong Yu Zhe Lin Yang Zhang and Shiyu Chang. 2023. 
Uncovering the disentanglement capability in text-to-image diffusion models. In CVPR."},{"key":"e_1_3_2_1_37_1","volume-title":"FastComposer: Tuning-Free Multi-Subject Image Generation with Localized Attention. CoRR","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Tianwei Yin, William T Freeman, Fr\u00e9do Durand, and Song Han. 2023. FastComposer: Tuning-Free Multi-Subject Image Generation with Localized Attention. CoRR, Vol. abs\/2305.10431 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Attngan: Fine-grained text to image generation with attentional generative adversarial networks. In CVPR.","author":"Xu Tao","year":"2018","unstructured":"Tao Xu, Pengchuan Zhang, Qiuyuan Huang, Han Zhang, Zhe Gan, Xiaolei Huang, and Xiaodong He. 2018. Attngan: Fine-grained text to image generation with attentional generative adversarial networks. In CVPR."},{"key":"e_1_3_2_1_39_1","volume-title":"Vitpose: Simple vision transformer baselines for human pose estimation. In NIPS.","author":"Xu Yufei","year":"2022","unstructured":"Yufei Xu, Jing Zhang, Qiming Zhang, and Dacheng Tao. 2022. Vitpose: Simple vision transformer baselines for human pose estimation. In NIPS."},{"key":"e_1_3_2_1_40_1","volume-title":"ViTPose: Vision Transformer Foundation Model for Generic Body Pose Estimation. CoRR","author":"Xu Yufei","year":"2022","unstructured":"Yufei Xu, Jing Zhang, Qiming Zhang, and Dacheng Tao. 2022. ViTPose: Vision Transformer Foundation Model for Generic Body Pose Estimation. CoRR, Vol. abs\/2212.04246 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Zujin Guo, Kaiyang Zhou, Wayne Zhang, and Ziwei Liu.","author":"Yang Jingkang","year":"2022","unstructured":"Jingkang Yang, Yi Zhe Ang, Zujin Guo, Kaiyang Zhou, Wayne Zhang, and Ziwei Liu. 2022. Panoptic scene graph generation. 
In ECCV."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.597"},{"key":"e_1_3_2_1_43_1","volume-title":"Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al.","author":"Yu Jiahui","year":"2023","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al. 2023. Scaling autoregressive models for content-rich text-to-image generation. CoRR, Vol. abs\/2206.10789 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Jason Baldridge, Honglak Lee, and Yinfei Yang.","author":"Zhang Han","year":"2021","unstructured":"Han Zhang, Jing Yu Koh, Jason Baldridge, Honglak Lee, and Yinfei Yang. 2021. Cross-modal contrastive learning for text-to-image generation. In CVPR."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Lvmin Zhang Anyi Rao and Maneesh Agrawala. 2023. Adding conditional control to text-to-image diffusion models. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_46_1","volume-title":"ProSpect: Expanded Conditioning for the Personalization of Attribute-aware Image Generation. CoRR","author":"Zhang Yuxin","year":"2023","unstructured":"Yuxin Zhang, Weiming Dong, Fan Tang, Nisha Huang, Haibin Huang, Chongyang Ma, Tong-Yee Lee, Oliver Deussen, and Changsheng Xu. 2023. ProSpect: Expanded Conditioning for the Personalization of Attribute-aware Image Generation. CoRR, Vol. abs\/2305.16225 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Yuxin Zhang Nisha Huang Fan Tang Haibin Huang Chongyang Ma Weiming Dong and Changsheng Xu. 2023. Inversion-based style transfer with diffusion models. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00978"},{"key":"e_1_3_2_1_48_1","volume-title":"Dm-gan: Dynamic memory generative adversarial networks for text-to-image synthesis. 
In CVPR.","author":"Zhu Minfeng","year":"2019","unstructured":"Minfeng Zhu, Pingbo Pan, Wei Chen, and Yi Yang. 2019. Dm-gan: Dynamic memory generative adversarial networks for text-to-image synthesis. In CVPR."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680621","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680621","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680621"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":48,"alternative-id":["10.1145\/3664647.3680621","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680621","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}