{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:02:56Z","timestamp":1750309376706,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680729","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"196-204","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Equilibrated Diffusion: Frequency-aware Textual Embedding for Equilibrated Image Customization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9492-5324","authenticated-orcid":false,"given":"Liyuan","family":"Ma","sequence":"first","affiliation":[{"name":"Westlake University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7993-5188","authenticated-orcid":false,"given":"Xueji","family":"Fang","sequence":"additional","affiliation":[{"name":"Zhejiang University &amp; Westlake University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3508-1851","authenticated-orcid":false,"given":"Guo-Jun","family":"Qi","sequence":"additional","affiliation":[{"name":"Westlake University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3618322"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618154"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27850"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_6_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Chen Hong","year":"2023","unstructured":"Hong Chen, Yipeng Zhang, Simin Wu, Xin Wang, Xuguang Duan, Yuwei Zhou, and Wenwu Zhu. 2023. Disenbooth: Identity-preserving disentangled tuning for subject-driven text-to-image generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_7_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Chen Wenhu","year":"2024","unstructured":"Wenhu Chen, Hexiang Hu, Yandong Li, Nataniel Ruiz, Xuhui Jia, Ming-Wei Chang, and William W Cohen. 2024. Subject-driven text-to-image generation via apprenticeship learning. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"Custom-edit: Text-guided image editing with customized diffusion models. arXiv preprint arXiv:2305.15779","author":"Choi Jooyoung","year":"2023","unstructured":"Jooyoung Choi, Yunjey Choi, Yunji Kim, Junho Kim, and Sungroh Yoon. 2023. Custom-edit: Text-guided image editing with customized diffusion models. arXiv preprint arXiv:2305.15779 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"IDAdapter: Learning Mixed Features for Tuning-Free Personalization of Text-to-Image Models. arXiv preprint arXiv:2403.13535","author":"Cui Siying","year":"2024","unstructured":"Siying Cui, Jiankang Deng, Jia Guo, Xiang An, Yongle Zhao, Xinyu Wei, and Ziyong Feng. 2024. IDAdapter: Learning Mixed Features for Tuning-Free Personalization of Text-to-Image Models. arXiv preprint arXiv:2403.13535 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"DreamArtist: Towards Controllable One-Shot Text-to-Image Generation via Positive-Negative Prompt-Tuning. arXiv preprint arXiv:2211.11337","author":"Dong Ziyi","year":"2022","unstructured":"Ziyi Dong, Pengxu Wei, and Liang Lin. 2022. DreamArtist: Towards Controllable One-Shot Text-to-Image Generation via Positive-Negative Prompt-Tuning. arXiv preprint arXiv:2211.11337 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=NAQvF08TcyG","author":"Gal Rinon","year":"2023","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit Haim Bermano, Gal Chechik, and Daniel Cohen-or. 2023. An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=NAQvF08TcyG"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592133"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00673"},{"key":"e_1_3_2_1_14_1","unstructured":"Shaozhe Hao Kai Han Shihao Zhao and Kwan-Yee K Wong. 2023. ViCo: Plug-and-play Visual Condition for Personalized Text-to-image Generation. (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Freestyle: Free lunch for text-guided style transfer using diffusion models. arXiv preprint arXiv:2401.15636","author":"He Feihong","year":"2024","unstructured":"Feihong He, Gang Li, Mengyuan Zhang, Leilei Yan, Lingyu Si, and Fanzhang Li. 2024. Freestyle: Free lunch for text-guided style transfer using diffusion models. arXiv preprint arXiv:2401.15636 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_17_1","volume-title":"DreamTuner: Single Image is Enough for Subject-Driven Generation. arXiv preprint arXiv:2312.13691","author":"Hua Miao","year":"2023","unstructured":"Miao Hua, Jiawei Liu, Fei Ding, Wei Liu, Jie Wu, and Qian He. 2023. DreamTuner: Single Image is Enough for Subject-Driven Generation. arXiv preprint arXiv:2312.13691 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Yandong Li, Han Zhang, Boqing Gong, Tingbo Hou, Huisheng Wang, and Yu-Chuan Su.","author":"Jia Xuhui","year":"2023","unstructured":"Xuhui Jia, Yang Zhao, Kelvin CK Chan, Yandong Li, Han Zhang, Boqing Gong, Tingbo Hou, Huisheng Wang, and Yu-Chuan Su. 2023. Taming encoder for zero fine-tuning image customization with text-to-image diffusion models. arXiv preprint arXiv:2304.02642 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01366"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_1_21_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00352"},{"key":"e_1_3_2_1_23_1","volume-title":"Unified multi-modal latent diffusion for joint subject and text conditional image generation. arXiv preprint arXiv:2303.09319","author":"Ma Yiyang","year":"2023","unstructured":"Yiyang Ma, Huan Yang, Wenjing Wang, Jianlong Fu, and Jiaying Liu. 2023. Unified multi-modal latent diffusion for joint subject and text conditional image generation. arXiv preprint arXiv:2303.09319 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"DreamMatcher: Appearance Matching Self-Attention for Semantically-Consistent Text-to-Image Personalization. arXiv preprint arXiv:2402.09812","author":"Nam Jisu","year":"2024","unstructured":"Jisu Nam, Heesu Kim, DongJae Lee, Siyoon Jin, Seungryong Kim, and Seunggyu Chang. 2024. DreamMatcher: Appearance Matching Self-Attention for Semantically-Consistent Text-to-Image Personalization. arXiv preprint arXiv:2402.09812 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_26_1","volume-title":"View in Article","volume":"2","author":"R","year":"2023","unstructured":"R OpenAI. 2023. Gpt-4 technical report. arxiv 2303.08774. View in Article, Vol. 2, 5 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19797-0_15"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_29_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_1_30_1","unstructured":"Tianhe Ren Shilong Liu Ailing Zeng Jing Lin Kunchang Li He Cao Jiayu Chen Xinyu Huang Yukang Chen Feng Yan et al. 2024. Grounded sam: Assembling open-world models for diverse visual tasks. arXiv preprint arXiv:2401.14159 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_33_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, Vol. 35 (2022), 36479--36494."},{"key":"e_1_3_2_1_34_1","volume-title":"Instantbooth: Personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411","author":"Shi Jing","year":"2023","unstructured":"Jing Shi, Wei Xiong, Zhe Lin, and Hyun Joon Jung. 2023. Instantbooth: Personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411 (2023)."},{"key":"e_1_3_2_1_35_1","first-page":"23495","article-title":"Inception transformer","volume":"35","author":"Si Chenyang","year":"2022","unstructured":"Chenyang Si, Weihao Yu, Pan Zhou, Yichen Zhou, Xinchao Wang, and Shuicheng Yan. 2022. Inception transformer. Advances in Neural Information Processing Systems, Vol. 35 (2022), 23495--23509.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"Harmonizing Visual and Textual Embeddings for Zero-Shot Text-to-Image Customization. arXiv preprint arXiv:2403.14155","author":"Song Yeji","year":"2024","unstructured":"Yeji Song, Jimyeong Kim, Wonhark Park, Wonsik Shin, Wonjong Rhee, and Nojun Kwak. 2024. Harmonizing Visual and Textual Embeddings for Zero-Shot Text-to-Image Customization. arXiv preprint arXiv:2403.14155 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591506"},{"key":"e_1_3_2_1_40_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591560"},{"key":"e_1_3_2_1_42_1","volume-title":"Extended Textual Conditioning in Text-to-Image Generation. arXiv preprint arXiv:2303.09522","author":"Voynov Andrey","year":"2023","unstructured":"Andrey Voynov, Qinghao Chu, Daniel Cohen-Or, and Kfir Aberman. 2023. $ P $: Extended Textual Conditioning in Text-to-Image Generation. arXiv preprint arXiv:2303.09522 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Instantid: Zero-shot identity-preserving generation in seconds. arXiv preprint arXiv:2401.07519","author":"Wang Qixun","year":"2024","unstructured":"Qixun Wang, Xu Bai, Haofan Wang, Zekui Qin, and Anthony Chen. 2024. Instantid: Zero-shot identity-preserving generation in seconds. arXiv preprint arXiv:2401.07519 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Semantic image synthesis via diffusion models. arXiv preprint arXiv:2207.00050","author":"Wang Weilun","year":"2022","unstructured":"Weilun Wang, Jianmin Bao, Wengang Zhou, Dongdong Chen, Dong Chen, Lu Yuan, and Houqiang Li. 2022. Semantic image synthesis via diffusion models. arXiv preprint arXiv:2207.00050 (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"e_1_3_2_1_47_1","volume-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680729","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680729","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:24Z","timestamp":1750291584000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680729"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":47,"alternative-id":["10.1145\/3664647.3680729","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680729","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}