{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:50Z","timestamp":1776931190971,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3757377.3763956","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T16:27:29Z","timestamp":1765211249000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["DreamO: A Unified Framework for Image Customization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0296-4893","authenticated-orcid":false,"given":"Chong","family":"Mou","sequence":"first","affiliation":[{"name":"Bytedance, beijing, China and Peking University, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6648-5387","authenticated-orcid":false,"given":"Yanze","family":"Wu","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9622-3665","authenticated-orcid":false,"given":"Wenxu","family":"Wu","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7649-4097","authenticated-orcid":false,"given":"Zinan","family":"Guo","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3736-5295","authenticated-orcid":false,"given":"Pengze","family":"Zhang","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0918-3603","authenticated-orcid":false,"given":"Yufeng","family":"Cheng","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8216-0348","authenticated-orcid":false,"given":"Yiming","family":"Luo","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1880-9681","authenticated-orcid":false,"given":"Fei","family":"Ding","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1580-3396","authenticated-orcid":false,"given":"Shiwen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9813-6259","authenticated-orcid":false,"given":"Xinghui","family":"Li","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6724-6177","authenticated-orcid":false,"given":"Mengtian","family":"Li","sequence":"additional","affiliation":[{"name":"Bytedance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1638-8348","authenticated-orcid":false,"given":"Mingcong","family":"Liu","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3128-7521","authenticated-orcid":false,"given":"Yunsheng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7899-0863","authenticated-orcid":false,"given":"Shaojin","family":"Wu","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1399-0651","authenticated-orcid":false,"given":"Songtao","family":"Zhao","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5486-3125","authenticated-orcid":false,"given":"Jian","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9978-7904","authenticated-orcid":false,"given":"Qian","family":"He","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5860-5941","authenticated-orcid":false,"given":"Xinglong","family":"Wu","sequence":"additional","affiliation":[{"name":"Bytedance, beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_3_2_2_1","volume-title":"\"https:\/\/github.com\/black-forest-labs\/flux?tab=readme-ov-file\"","year":"2023","unstructured":"2023a. \"https:\/\/github.com\/black-forest-labs\/flux?tab=readme-ov-file\". \"https:\/\/github.com\/black-forest-labs\/flux?tab=readme-ov-file\""},{"key":"e_1_3_3_2_3_1","volume-title":"https:\/\/huggingface.co\/alimama-creative\/FLUX.1-Turbo-Alpha","year":"2023","unstructured":"2023b. https:\/\/huggingface.co\/alimama-creative\/FLUX.1-Turbo-Alpha. https:\/\/huggingface.co\/alimama-creative\/FLUX.1-Turbo-Alpha"},{"key":"e_1_3_3_2_4_1","volume-title":"https:\/\/huggingface.co\/black-forest-labs\/FLUX.1-Canny-dev","year":"2023","unstructured":"2023c. https:\/\/huggingface.co\/black-forest-labs\/FLUX.1-Canny-dev. https:\/\/huggingface.co\/black-forest-labs\/FLUX.1-Canny-dev"},{"key":"e_1_3_3_2_5_1","unstructured":"Marah Abdin Jyoti Aneja Hany Awadalla Ahmed Awadallah Ammar\u00a0Ahmad Awan Nguyen Bach Amit Bahree Arash Bakhtiari Jianmin Bao Harkirat Behl et\u00a0al. 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.14219 (2024)."},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_3_2_7_1","unstructured":"Xi Chen Zhifei Zhang He Zhang Yuqian Zhou Soo\u00a0Ye Kim Qing Liu Yijun Li Jianming Zhang Nanxuan Zhao Yilin Wang et\u00a0al. 2024b. UniReal: Universal Image Generation and Editing via Learning Real-world Dynamics. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.07774 (2024)."},{"key":"e_1_3_3_2_8_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et\u00a0al. 2024a. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05271 (2024)."},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"e_1_3_3_2_11_1","first-page":"206","volume-title":"European Conference on Computer Vision","author":"Choi Yisol","year":"2024","unstructured":"Yisol Choi, Sangkyung Kwak, Kyungmin Lee, Hyungwon Choi, and Jinwoo Shin. 2024. Improving diffusion models for authentic virtual try-on in the wild. In European Conference on Computer Vision. Springer, 206\u2013235."},{"key":"e_1_3_3_2_12_1","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021) 8780\u20138794."},{"key":"e_1_3_3_2_13_1","volume-title":"Forty-first international conference on machine learning","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et\u00a0al. 2024. Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first international conference on machine learning."},{"key":"e_1_3_3_2_14_1","unstructured":"Rinon Gal Yuval Alaluf Yuval Atzmon Or Patashnik Amit\u00a0H Bermano Gal Chechik and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01618 (2022)."},{"key":"e_1_3_3_2_15_1","first-page":"322","volume-title":"European Conference on Computer Vision","author":"Gal Rinon","year":"2024","unstructured":"Rinon Gal, Or Lichter, Elad Richardson, Or Patashnik, Amit\u00a0H Bermano, Gal Chechik, and Daniel Cohen-Or. 2024. Lcm-lookahead for encoder-based text-to-image personalization. In European Conference on Computer Vision. Springer, 322\u2013340."},{"key":"e_1_3_3_2_16_1","unstructured":"Junyao Gao Yanchen Liu Yanan Sun Yinhao Tang Yanhong Zeng Kai Chen and Cairong Zhao. 2024. Styleshot: A snapshot on any style. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.01414 (2024)."},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"crossref","unstructured":"Zinan Guo Yanze Wu Chen Zhuowei Peng Zhang Qian He et\u00a0al. 2024. Pulid: Pure and lightning id customization via contrastive alignment. Advances in Neural Information Processing Systems 37 (2024) 36777\u201336804.","DOI":"10.52202\/079017-1159"},{"key":"e_1_3_3_2_18_1","unstructured":"Junjie He Yifeng Geng and Liefeng Bo. 2024. UniPortrait: A Unified Framework for Identity-Preserving Single-and Multi-Human Image Personalization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.05939 (2024)."},{"key":"e_1_3_3_2_19_1","unstructured":"Junjie He Yuxiang Tuo Binghui Chen Chongyang Zhong Yifeng Geng and Liefeng Bo. 2025. AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.09503 (2025)."},{"key":"e_1_3_3_2_20_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01626 (2022)."},{"key":"e_1_3_3_2_21_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_2_22_1","unstructured":"Miao Hua Jiawei Liu Fei Ding Wei Liu Jie Wu and Qian He. 2023. Dreamtuner: Single image is enough for subject-driven generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.13691 (2023)."},{"key":"e_1_3_3_2_23_1","unstructured":"Lianghua Huang Di Chen Yu Liu Yujun Shen Deli Zhao and Jingren Zhou. 2023. Composer: Creative and controllable image synthesis with composable conditions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.09778 (2023)."},{"key":"e_1_3_3_2_24_1","unstructured":"Lianghua Huang Wei Wang Zhi-Fan Wu Yupeng Shi Huanzhang Dou Chen Liang Yutong Feng Yu Liu and Jingren Zhou. 2024b. In-context lora for diffusion transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.23775 (2024)."},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00714"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00594"},{"key":"e_1_3_3_2_27_1","unstructured":"Zhenchao Jin. 2023. Sssegmenation: An open source supervised semantic segmentation toolbox based on pytorch. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.17091 (2023)."},{"key":"e_1_3_3_2_28_1","unstructured":"Zhenchao Jin Xiaowei Hu Lingting Zhu Luchuan Song Li Yuan and Lequan Yu. 2024. IDRNet: Intervention-driven relation network for semantic segmentation. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_29_1","unstructured":"Diederik\u00a0P Kingma Max Welling et\u00a0al. 2013. Auto-encoding variational bayes."},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"e_1_3_3_2_32_1","unstructured":"Dongxu Li Junnan Li and Steven Hoi. 2023. Blip-diffusion: Pre-trained subject representation for controllable text-to-image generation and editing. Advances in Neural Information Processing Systems 36 (2023) 30146\u201330166."},{"key":"e_1_3_3_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00825"},{"key":"e_1_3_3_2_34_1","unstructured":"Yaron Lipman Ricky\u00a0TQ Chen Heli Ben-Hamu Maximilian Nickel and Matt Le. 2022. Flow matching for generative modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02747 (2022)."},{"key":"e_1_3_3_2_35_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_2_36_1","unstructured":"Junsheng Luan Guangyuan Li Lei Zhao and Wei Xing. 2025. MC-VTON: Minimal Control Virtual Try-On Diffusion Transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.03630 (2025)."},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657469"},{"key":"e_1_3_3_2_38_1","unstructured":"Yuhang Ma Wenting Xu Jiji Tang Qinfeng Jin Rongsheng Zhang Zeng Zhao Changjie Fan and Zhipeng Hu. 2024b. Character-Adapter: Prompt-Guided Region Control for High-Fidelity Character Customization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16537 (2024)."},{"key":"e_1_3_3_2_39_1","unstructured":"Chenlin Meng Yutong He Yang Song Jiaming Song Jiajun Wu Jun-Yan Zhu and Stefano Ermon. 2021. Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.01073 (2021)."},{"key":"e_1_3_3_2_40_1","unstructured":"Chong Mou Xintao Wang Jiechong Song Ying Shan and Jian Zhang. 2023. Dragondiffusion: Enabling drag-style manipulation on diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.02421 (2023)."},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"e_1_3_3_2_42_1","first-page":"16784","volume-title":"International Conference on Machine Learning","author":"Nichol Alexander\u00a0Quinn","year":"2022","unstructured":"Alexander\u00a0Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. PMLR, 16784\u201316804."},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_44_1","unstructured":"Adam Polyak Amit Zohar Andrew Brown Andros Tjandra Animesh Sinha Ann Lee Apoorv Vyas Bowen Shi Chih-Yao Ma Ching-Yao Chuang David Yan Dhruv Choudhary Dingkang Wang Geet Sethi Guan Pang Haoyu Ma Ishan Misra Ji Hou Jialiang Wang Kiran Jagadeesh Kunpeng Li Luxin Zhang Mannat Singh Mary Williamson Matt Le Matthew Yu Mitesh\u00a0Kumar Singh Peizhao Zhang Peter Vajda Quentin Duval Rohit Girdhar Roshan Sumbaly Sai\u00a0Saketh Rambhatla Sam Tsai Samaneh Azadi Samyak Datta Sanyuan Chen Sean Bell Sharadh Ramaswamy Shelly Sheynin Siddharth Bhattacharya Simran Motwani Tao Xu Tianhe Li Tingbo Hou Wei-Ning Hsu Xi Yin Xiaoliang Dai Yaniv Taigman Yaqiao Luo Yen-Cheng Liu Yi-Chiao Wu Yue Zhao Yuval Kirstain Zecheng He Zijian He Albert Pumarola Ali Thabet Artsiom Sanakoyeu Arun Mallya Baishan Guo Boris Araya Breena Kerr Carleigh Wood Ce Liu Cen Peng Dimitry Vengertsev Edgar Schonfeld Elliot Blanchard Felix Juefei-Xu Fraylie Nord Jeff Liang John Hoffman Jonas Kohler Kaolin Fire Karthik Sivakumar Lawrence Chen Licheng Yu Luya Gao Markos Georgopoulos Rashel Moritz Sara\u00a0K. Sampson Shikai Li Simone Parmeggiani Steve Fine Tara Fowler Vladan Petrovic and Yuming Du. 2025. Movie Gen: A Cast of Media Foundation Models. arxiv:https:\/\/arXiv.org\/abs\/2410.13720\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2410.13720"},{"key":"e_1_3_3_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00830"},{"key":"e_1_3_3_2_46_1","unstructured":"Can Qin Shu Zhang Ning Yu Yihao Feng Xinyi Yang Yingbo Zhou Huan Wang Juan\u00a0Carlos Niebles Caiming Xiong Silvio Savarese et\u00a0al. 2023. Unicontrol: A unified diffusion model for controllable visual generation in the wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.11147 (2023)."},{"key":"e_1_3_3_2_47_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. 8748\u20138763."},{"key":"e_1_3_3_2_48_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 (2022)."},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_2_51_1","unstructured":"Simo Ryu. 2023. Low-rank adaptation for fast text-to-image diffusion fine-tuning."},{"key":"e_1_3_3_2_52_1","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily Denton Seyed Kamyar\u00a0Seyed Ghasemipour Burcu\u00a0Karagol Ayan S\u00a0Sara Mahdavi Rapha\u00a0Gontijo Lopes et\u00a0al. 2022a. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.11487 (2022)."},{"key":"e_1_3_3_2_53_1","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L Denton Kamyar Ghasemipour Raphael Gontijo\u00a0Lopes Burcu Karagol\u00a0Ayan Tim Salimans et\u00a0al. 2022b. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems 35 (2022) 36479\u201336494."},{"key":"e_1_3_3_2_54_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32729"},{"key":"e_1_3_3_2_55_1","doi-asserted-by":"crossref","unstructured":"Gowthami Somepalli Anubhav Gupta Kamal Gupta Shramay Palta Micah Goldblum Jonas Geiping Abhinav Shrivastava and Tom Goldstein. 2024. Measuring Style Similarity in Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.01292 (2024).","DOI":"10.1007\/978-3-031-72848-8_9"},{"key":"e_1_3_3_2_56_1","doi-asserted-by":"crossref","unstructured":"Jianlin Su Murtadha Ahmed Yu Lu Shengfeng Pan Wen Bo and Yunfeng Liu. 2024. Roformer: Enhanced transformer with rotary position embedding. Neurocomputing 568 (2024) 127063.","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_3_2_57_1","unstructured":"Zhenxiong Tan Songhua Liu Xingyi Yang Qiaochu Xue and Xinchao Wang. 2024. OminiControl: Minimal and Universal Control for Diffusion Transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.15098 (2024)."},{"key":"e_1_3_3_2_58_1","unstructured":"Zhenchen Wan Dongting Hu Weilun Cheng Tianxi Chen Zhaoqing Wang Feng Liu Tongliang Liu Mingming Gong et\u00a0al. 2025. MF-VITON: High-Fidelity Mask-Free Virtual Try-On with Minimal Input. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.08650 (2025)."},{"key":"e_1_3_3_2_59_1","unstructured":"Haofan Wang Matteo Spinelli Qixun Wang Xu Bai Zekui Qin and Anthony Chen. 2024c. Instantstyle: Free lunch towards style-preserving in text-to-image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02733 (2024)."},{"key":"e_1_3_3_2_60_1","unstructured":"Qixun Wang Xu Bai Haofan Wang Zekui Qin Anthony Chen Huaxia Li Xu Tang and Yao Hu. 2024a. Instantid: Zero-shot identity-preserving generation in seconds. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.07519 (2024)."},{"key":"e_1_3_3_2_61_1","unstructured":"Xierui Wang Siming Fu Qihan Huang Wanggui He and Hao Jiang. 2024b. Ms-diffusion: Multi-subject zero-shot image personalization with layout guidance. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.07209 (2024)."},{"key":"e_1_3_3_2_62_1","unstructured":"Zongze Wu Yotam Nitzan Eli Shechtman and Dani Lischinski. 2021. Stylealign: Analysis and applications of aligned stylegan models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2110.11323 (2021)."},{"key":"e_1_3_3_2_63_1","unstructured":"Guangxuan Xiao Tianwei Yin William\u00a0T Freeman Fr\u00e9do Durand and Song Han. 2024b. Fastcomposer: Tuning-free multi-subject image generation with localized attention. International Journal of Computer Vision (2024) 1\u201320."},{"key":"e_1_3_3_2_64_1","unstructured":"Shitao Xiao Yueze Wang Junjie Zhou Huaying Yuan Xingrun Xing Ruiran Yan Chaofan Li Shuting Wang Tiejun Huang and Zheng Liu. 2024a. Omnigen: Unified image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.11340 (2024)."},{"key":"e_1_3_3_2_65_1","unstructured":"Peng Xing Haofan Wang Yanpeng Sun Qixun Wang Xu Bai Hao Ai Renyuan Huang and Zechao Li. 2024. Csgo: Content-style composition in text-to-image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.16766 (2024)."},{"key":"e_1_3_3_2_66_1","first-page":"260","volume-title":"Proceedings of the Asian Conference on Computer Vision","author":"Yamaguchi Rento","year":"2024","unstructured":"Rento Yamaguchi and Keiji Yanai. 2024. Exploring Cross-Attention Maps in Multi-modal Diffusion Transformers for Training-Free Semantic Segmentation. In Proceedings of the Asian Conference on Computer Vision. 260\u2013274."},{"key":"e_1_3_3_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_3_2_68_1","unstructured":"Hu Ye Jun Zhang Sibo Liu Xiao Han and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06721 (2023)."},{"key":"e_1_3_3_2_69_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-96530-3"},{"key":"e_1_3_3_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_2_71_1","unstructured":"Shihao Zhao Dongdong Chen Yen-Chun Chen Jianmin Bao Shaozhe Hao Lu Yuan and Kwan-Yee\u00a0K Wong. 2023. Uni-controlnet: All-in-one control to text-to-image diffusion models. Advances in Neural Information Processing Systems 36 (2023) 11127\u201311150."}],"event":{"name":"SA Conference Papers '25: SIGGRAPH Asia 2025 Conference Papers","location":"Hong Kong Hong Kong","acronym":"SA Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the SIGGRAPH Asia 2025 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3757377.3763956","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T03:23:40Z","timestamp":1765250620000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757377.3763956"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":70,"alternative-id":["10.1145\/3757377.3763956","10.1145\/3757377"],"URL":"https:\/\/doi.org\/10.1145\/3757377.3763956","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}