{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T17:43:32Z","timestamp":1776102212267,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755141","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"3645-3653","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Twin Co-Adaptive Dialogue for Progressive Image Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7053-5645","authenticated-orcid":false,"given":"Jianhui","family":"Wang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7172-9403","authenticated-orcid":false,"given":"Yangfan","family":"He","sequence":"additional","affiliation":[{"name":"University of Minnesota Twin Cities, Minneapolis, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0005-2620","authenticated-orcid":false,"given":"Yan","family":"Zhong","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4209-5671","authenticated-orcid":false,"given":"Xinyuan","family":"Song","sequence":"additional","affiliation":[{"name":"Emory University, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7938-164X","authenticated-orcid":false,"given":"Jiayi","family":"Su","sequence":"additional","affiliation":[{"name":"Xiamen University Malaysia, Sepang, Malaysia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9449-937X","authenticated-orcid":false,"given":"Yuheng","family":"Feng","sequence":"additional","affiliation":[{"name":"Hong Kong Polytechnic University, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5438-6541","authenticated-orcid":false,"given":"Ruoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4889-8118","authenticated-orcid":false,"given":"Hongyang","family":"He","sequence":"additional","affiliation":[{"name":"University of Warwick, coventry, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9346-3153","authenticated-orcid":false,"given":"Wenyu","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1949-9312","authenticated-orcid":false,"given":"Xinhang","family":"Yuan","sequence":"additional","affiliation":[{"name":"Washington University, Saint Louis, Saint Loius, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0551-9678","authenticated-orcid":false,"given":"Miao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9402-1930","authenticated-orcid":false,"given":"Keqin","family":"Li","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6698-3353","authenticated-orcid":false,"given":"Jiaqi","family":"Chen","sequence":"additional","affiliation":[{"name":"Google, Chicago, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4271-0871","authenticated-orcid":false,"given":"Tianyu","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-0593","authenticated-orcid":false,"given":"Xueqian","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Yi: Open Foundation Models by 01.AI. arXiv:2403.04652 [cs.CL]","author":"Young Alex","year":"2024","unstructured":"01. AI:, Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Heng Li, Jiangcheng Zhu, Jianqun Chen, Jing Chang, Kaidong Yu, Peng Liu, Qiang Liu, Shawn Yue, Senbin Yang, Shiming Yang, Tao Yu, Wen Xie, Wenhao Huang, Xiaohui Hu, Xiaoyi Ren, Xinyao Niu, Pengcheng Nie, Yuchi Xu, Yudong Liu, Yue Wang, Yuxuan Cai, Zhenyu Gu, Zhiyuan Liu, and Zonghong Dai. 2024. Yi: Open Foundation Models by 01.AI. arXiv:2403.04652 [cs.CL]"},{"key":"e_1_3_2_1_3_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo et al. 2023. Improving Image Generation with Better Captions. DALL-E 3 (2023). OpenAI."},{"key":"e_1_3_2_1_5_1","volume-title":"Muse: Text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704","author":"Chang Huiwen","year":"2023","unstructured":"Huiwen Chang, Han Zhang, Jarred Barber, AJ Maschinot, Jose Lezama, Lu Jiang, Ming-Hsuan Yang, Kevin Murphy, William T Freeman, Michael Rubinstein, et al., 2023. Muse: Text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Yuval Alaluf Yael Vinker Lior Wolf and Daniel Cohen-Or. 2023. Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models. arXiv:2301.13826 [cs.CV] https:\/\/arxiv.org\/abs\/2301.13826","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_1_7_1","volume-title":"Masked-attention diffusion guidance for spatially controlling text-to-image generation. The Visual Computer","author":"Endo Yuki","year":"2023","unstructured":"Yuki Endo. 2023. Masked-attention diffusion guidance for spatially controlling text-to-image generation. The Visual Computer (2023), 1-13."},{"key":"e_1_3_2_1_8_1","unstructured":"Patrick Esser Sumith Kulal Andreas Blattmann Rahim Entezari Jonas M\u00fcller Harry Saini Yam Levi Dominik Lorenz Axel Sauer Frederic Boesel Dustin Podell Tim Dockhorn Zion English Kyle Lacey Alex Goodwin Yannik Marek and Robin Rombach. 2024. Scaling Rectified Flow Transformers for High-Resolution Image Synthesis. arXiv:2403.03206 [cs.CV] https:\/\/arxiv.org\/abs\/2403.03206"},{"key":"e_1_3_2_1_9_1","volume-title":"ChatGPT is not all you need. A State of the Art Review of large Generative AI models. arXiv preprint arXiv:2301.04655","author":"Gozalo-Brizuela Roberto","year":"2023","unstructured":"Roberto Gozalo-Brizuela and Eduardo C Garrido-Merchan. 2023. ChatGPT is not all you need. A State of the Art Review of large Generative AI models. arXiv preprint arXiv:2301.04655 (2023)."},{"key":"e_1_3_2_1_10_1","unstructured":"Meera Hahn Wenjun Zeng Nithish Kannen Rich Galt Kartikeya Badola Been Kim and Zi Wang. 2024. Proactive Agents for Multi-Turn Text-to-Image Generation Under Uncertainty. arXiv:2412.06771 [cs.AI] https:\/\/arxiv.org\/abs\/2412.06771"},{"key":"e_1_3_2_1_11_1","volume-title":"TDRI: Two-Phase Dialogue Refinement and Co-Adaptation for Interactive Image Generation. In ICLR 2025 Workshop on Human-AI Coevolution. https:\/\/openreview.net\/forum?id=ApDqODJVej","author":"He Yangfan","year":"2025","unstructured":"Yangfan He, Yuheng Feng, Jianhui Wang, Kun Li, Yijin Wang, Haoyuan Li, Sida Li, Yinghui Xia, TIANYU SHI, and Miao Zhang. 2025. TDRI: Two-Phase Dialogue Refinement and Co-Adaptation for Interactive Image Generation. In ICLR 2025 Workshop on Human-AI Coevolution. https:\/\/openreview.net\/forum?id=ApDqODJVej"},{"key":"e_1_3_2_1_12_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_1_13_1","unstructured":"Minbin Huang Yanxin Long Xinchi Deng Ruihang Chu Jiangfeng Xiong Xiaodan Liang Hong Cheng Qinglin Lu and Wei Liu. 2024. DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation. arXiv:2403.08857 [cs.CV] https:\/\/arxiv.org\/abs\/2403.08857"},{"key":"e_1_3_2_1_14_1","unstructured":"Imagen-Team-Google. 2024. Imagen 3. arXiv:2408.07009 [cs.CV] https:\/\/arxiv.org\/abs\/2408.07009"},{"key":"e_1_3_2_1_15_1","volume-title":"Agrim Gupta, Yunzhi Zhang, Deepak Narayanan, Hannah Teufel, Marco Bellagente, et al.","author":"Lee Tony","year":"2024","unstructured":"Tony Lee, Michihiro Yasunaga, Chenlin Meng, Yifan Mai, Joon Sung Park, Agrim Gupta, Yunzhi Zhang, Deepak Narayanan, Hannah Teufel, Marco Bellagente, et al., 2024. Holistic evaluation of text-to-image models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv:2201.12086 [cs.CV] https:\/\/arxiv.org\/abs\/2201.12086","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv:2201.12086 [cs.CV] https:\/\/arxiv.org\/abs\/2201.12086"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Youwei Liang Junfeng He Gang Li Peizhao Li Arseniy Klimovskiy Nicholas Carolan Jiao Sun Jordi Pont-Tuset Sarah Young Feng Yang et al. 2023. Rich Human Feedback for Text-to-Image Generation. arXiv preprint arXiv:2312.10240 (2023).","DOI":"10.1109\/CVPR52733.2024.01835"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.804"},{"key":"e_1_3_2_1_19_1","unstructured":"OpenAI. 2024. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_2_1_20_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021a. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_21_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021b. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_22_1","unstructured":"Rafael Rafailov Archit Sharma Eric Mitchell Stefano Ermon Christopher D. Manning and Chelsea Finn. 2024. Direct Preference Optimization: Your Language Model is Secretly a Reward Model. arXiv:2305.18290 [cs.LG] https:\/\/arxiv.org\/abs\/2305.18290"},{"key":"e_1_3_2_1_23_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv:2204.06125 [cs.CV] https:\/\/arxiv.org\/abs\/2204.06125"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arXiv:2112.10752 [cs.CV] https:\/\/arxiv.org\/abs\/2112.10752","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_25_1","volume-title":"Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, and Mohammad Norouzi.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. arXiv:2205.11487 [cs.CV] https:\/\/arxiv.org\/abs\/2205.11487"},{"key":"e_1_3_2_1_26_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2022. Denoising Diffusion Implicit Models. arXiv:2010.02502 [cs.LG] https:\/\/arxiv.org\/abs\/2010.02502"},{"key":"e_1_3_2_1_27_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.522"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642803"},{"key":"e_1_3_2_1_30_1","unstructured":"Xun Wu Shaohan Huang Guolong Wang Jing Xiong and Furu Wei. 2024. Multimodal Large Language Models Make Text-to-Image Generative Models Align Better. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=IRXyPm9IPW"},{"key":"e_1_3_2_1_31_1","volume-title":"Better aligning text-to-image models with human preference. arXiv preprint arXiv:2303.14420","author":"Wu Xiaoshi","year":"2023","unstructured":"Xiaoshi Wu, Keqiang Sun, Feng Zhu, Rui Zhao, and Hongsheng Li. 2023. Better aligning text-to-image models with human preference. arXiv preprint arXiv:2303.14420 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Xu Jiazheng","year":"2024","unstructured":"Jiazheng Xu, Xiao Liu, Yuchen Wu, Yuxuan Tong, Qinkai Li, Ming Ding, Jie Tang, and Yuxiao Dong. 2024. Imagereward: Learning and evaluating human preferences for text-to-image generation. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Kai Yang Jian Tao Jiafei Lyu Chunjiang Ge Jiaxin Chen Qimai Li Weihan Shen Xiaolong Zhu and Xiu Li. 2024. Using Human Feedback to Fine-tune Diffusion Models without Any Reward Model. arXiv:2311.13231 [cs.LG] https:\/\/arxiv.org\/abs\/2311.13231","DOI":"10.1109\/CVPR52733.2024.00854"},{"key":"e_1_3_2_1_34_1","volume-title":"Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al.","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al., 2022. Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789, Vol. 2, 3 (2022), 5."},{"key":"e_1_3_2_1_35_1","volume-title":"Self-Play Fine-tuning of Diffusion Models for Text-to-image Generation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=q3XavKPorV","author":"Yuan Huizhuo","year":"2024","unstructured":"Huizhuo Yuan, Zixiang Chen, Kaixuan Ji, and Quanquan Gu. 2024. Self-Play Fine-tuning of Diffusion Models for Text-to-image Generation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=q3XavKPorV"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00862"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Wendi Zheng Jiayan Teng Zhuoyi Yang Weihan Wang Jidong Chen Xiaotao Gu Yuxiao Dong Ming Ding and Jie Tang. 2024. CogView3: Finer and Faster Text-to-Image Generation via Relay Diffusion. arXiv:2403.05121 [cs.CV] https:\/\/arxiv.org\/abs\/2403.05121","DOI":"10.1007\/978-3-031-72980-5_1"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755141","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:55:35Z","timestamp":1765310135000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755141"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":37,"alternative-id":["10.1145\/3746027.3755141","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755141","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}