{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:51:44Z","timestamp":1765309904266,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","funder":[{"name":"Provincial Key Research and Development Plan of Zhejiang Province","award":["No. 2024C01250(SD2)"],"award-info":[{"award-number":["No. 2024C01250(SD2)"]}]},{"name":"National Natural Science Foundation of China","award":["No. 62006208"],"award-info":[{"award-number":["No. 62006208"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755220","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"9901-9910","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Inversion-DPO: Precise and Efficient Post-Training for Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5313-2742","authenticated-orcid":false,"given":"Zejian","family":"Li","sequence":"first","affiliation":[{"name":"Zhejiang University, Ningbo, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6774-6319","authenticated-orcid":false,"given":"Yize","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Ningbo, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4787-6232","authenticated-orcid":false,"given":"Chenye","family":"Meng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8449-1802","authenticated-orcid":false,"given":"Zhongni","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1905-8053","authenticated-orcid":false,"given":"Ling","family":"Yang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3762-1612","authenticated-orcid":false,"given":"Shengyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8061-742X","authenticated-orcid":false,"given":"Guang","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0065-6272","authenticated-orcid":false,"given":"Changyuan","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4950-671X","authenticated-orcid":false,"given":"Zhiyuan","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5561-0493","authenticated-orcid":false,"given":"Lingyun","family":"Sun","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Oron Ashual and Lior Wolf. 2019. Specifying Object Attributes and Relations in Interactive Scene Generation. 4560-4568.","DOI":"10.1109\/ICCV.2019.00466"},{"key":"e_1_3_2_2_2_1","volume-title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback. ArXiv","author":"Bai Yuntao","year":"2022","unstructured":"Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. 2022. Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback. ArXiv, Vol. abs\/2204.05862 (2022)."},{"key":"e_1_3_2_2_3_1","volume-title":"Training Diffusion Models with Reinforcement Learning. ArXiv","author":"Black Kevin","year":"2024","unstructured":"Kevin Black, Michael Janner, Yilun Du, Ilya Kostrikov, and Sergey Levine. 2024. Training Diffusion Models with Reinforcement Learning. ArXiv, Vol. abs\/2305.13301 (2024)."},{"key":"e_1_3_2_2_4_1","volume-title":"Coco-stuff: Thing and stuff classes in context. 1209-1218.","author":"Caesar Holger","year":"2018","unstructured":"Holger Caesar, Jasper Uijlings, and Vittorio Ferrari. 2018. Coco-stuff: Thing and stuff classes in context. 1209-1218."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_2_6_1","volume-title":"PixArt-\u03a3: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation. ArXiv","author":"Chen Junsong","year":"2024","unstructured":"Junsong Chen, Chongjian Ge, Enze Xie, Yue Wu, Lewei Yao, Xiaozhe Ren, Zhongdao Wang, Ping Luo, Huchuan Lu, and Zhenguo Li. 2024. PixArt-\u03a3: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation. ArXiv, Vol. abs\/2403.04692 (2024). https:\/\/api.semanticscholar.org\/CorpusID:268264262"},{"key":"e_1_3_2_2_7_1","first-page":"8780","volume-title":"Wortman Vaughan (Eds.)","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Advances in Neural Information Processing Systems, M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan (Eds.), Vol. 34. 8780-8794."},{"key":"e_1_3_2_2_8_1","volume-title":"Chen","author":"Domingo-Enrich Carles","year":"2025","unstructured":"Carles Domingo-Enrich, Michal Drozdzal, Brian Karrer, and Ricky T. Q. Chen. 2025. Adjoint Matching: Fine-tuning Flow and Diffusion Generative Models with Memoryless Stochastic Optimal Control. ArXiv, Vol. abs\/2409.08861 (2025)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00683"},{"key":"e_1_3_2_2_10_1","first-page":"125487","volume-title":"Advances in Neural Information Processing Systems","volume":"37","author":"Eyring Luca","year":"2024","unstructured":"Luca Eyring, Shyamgopal Karthik, Karsten Roth, Alexey Dosovitskiy, and Zeynep Akata. 2024. ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise Optimization. In Advances in Neural Information Processing Systems, Vol. 37. Curran Associates, Inc., 125487-125519."},{"key":"e_1_3_2_2_11_1","first-page":"79858","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Fan Ying","year":"2023","unstructured":"Ying Fan, Olivia Watkins, Yuqing Du, Hao Liu, Moonkyung Ryu, Craig Boutilier, Pieter Abbeel, Mohammad Ghavamzadeh, Kangwook Lee, and Kimin Lee. 2023. DPOK: Reinforcement Learning for Fine-tuning Text-to-Image Diffusion Models. In Advances in Neural Information Processing Systems, Vol. 36. Curran Associates, Inc., 79858-79885."},{"key":"e_1_3_2_2_12_1","volume-title":"Xin Eric Wang, and William Yang Wang","author":"Feng Weixi","year":"2023","unstructured":"Weixi Feng, Xuehai He, Tsu-Jui Fu, Varun Jampani, Arjun Akula, Pradyumna Narayana, Sugato Basu, Xin Eric Wang, and William Yang Wang. 2023. Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis."},{"key":"e_1_3_2_2_13_1","first-page":"4744","article-title":"Ranni","author":"Feng Yutong","year":"2024","unstructured":"Yutong Feng, Biao Gong, Di Chen, Yujun Shen, Yu Liu, and Jingren Zhou. 2024. Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following. 4744-4753.","journal-title":"Taming Text-to-Image Diffusion for Accurate Instruction Following."},{"key":"e_1_3_2_2_14_1","volume-title":"Computer Vision - ECCV","author":"Garibi Daniel","year":"2024","unstructured":"Daniel Garibi, Or Patashnik, Andrey Voynov, Hadar Averbuch-Elor, and Daniel Cohen-Or. 2025. ReNoise: Real Image Inversion Through Iterative Noising. In Computer Vision - ECCV 2024, Ale\u0161 Leonardis, Elisa Ricci, Stefan Roth, Olga Russakovsky, Torsten Sattler, and G\u00fcl Varol (Eds.). Springer Nature Switzerland, Cham, 395-413."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681678"},{"key":"e_1_3_2_2_16_1","volume-title":"GANs Trained by a Two Time-Scale Update Rule Converge to a Nash Equilibrium. ArXiv","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, G\u00fcnter Klambauer, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Nash Equilibrium. ArXiv, Vol. abs\/1706.08500 (2017). https:\/\/api.semanticscholar.org\/CorpusID:231697514"},{"key":"e_1_3_2_2_17_1","first-page":"6840","article-title":"Denoising Diffusion Probabilistic Models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Advances in Neural Information Processing Systems, Vol. 33. 6840-6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_18_1","volume-title":"ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment. arXiv:2403.05135 [cs.CV]","author":"Hu Xiwei","year":"2024","unstructured":"Xiwei Hu, Rui Wang, Yixiao Fang, Bin Fu, Pei Cheng, and Gang Yu. 2024. ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment. arXiv:2403.05135 [cs.CV]"},{"key":"e_1_3_2_2_19_1","first-page":"78723","article-title":"T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation","volume":"36","author":"Huang Kaiyi","year":"2023","unstructured":"Kaiyi Huang, Kaiyue Sun, Enze Xie, Zhenguo Li, and Xihui Liu. 2023. T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation. In Advances in Neural Information Processing Systems, Vol. 36. 78723-78747.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681657"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681498"},{"key":"e_1_3_2_2_22_1","volume-title":"RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine Semantic Re-alignment. ArXiv","author":"Jiang Zutao","year":"1959","unstructured":"Zutao Jiang, Guian Fang, Jianhua Han, Guansong Lu, Hang Xu, Shengcai Liao, Xiaojun Chang, and Xiaodan Liang. 2024a. RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine Semantic Re-alignment. ArXiv, Vol. abs\/2305.19599 (2024)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Justin Johnson Agrim Gupta and Li Fei-Fei. 2018. Image Generation from Scene Graphs. 1219-1228.","DOI":"10.1109\/CVPR.2018.00133"},{"key":"e_1_3_2_2_24_1","volume-title":"International Conference on Representation Learning.","author":"Ju Xuan","year":"2023","unstructured":"Xuan Ju, Ailing Zeng, Yuxuan Bian, Shaoteng Liu, and Qiang Xu. 2023. Direct Inversion: Boosting Diffusion-based Editing with 3 Lines of Code. In International Conference on Representation Learning."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00246"},{"key":"e_1_3_2_2_26_1","first-page":"36652","article-title":"Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation","volume":"36","author":"Kirstain Yuval","year":"2023","unstructured":"Yuval Kirstain, Adam Polyak, Uriel Singer, Shahbuland Matiana, Joe Penna, and Omer Levy. 2023. Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation. In Advances in Neural Information Processing Systems, Vol. 36. 36652-36663.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_2_28_1","volume-title":"Reward Guided Latent Consistency Distillation. Transactions on Machine Learning Research","author":"Li Jiachen","year":"2024","unstructured":"Jiachen Li, Weixi Feng, Wenhu Chen, and William Yang Wang. 2024a. Reward Guided Latent Consistency Distillation. Transactions on Machine Learning Research (2024). Featured Certification."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680597"},{"key":"e_1_3_2_2_30_1","first-page":"22511","article-title":"GLIGEN","author":"Li Yuheng","year":"2023","unstructured":"Yuheng Li, Haotian Liu, Qingyang Wu, Fangzhou Mu, Jianwei Yang, Jianfeng Gao, Chunyuan Li, and Yong Jae Lee. 2023. GLIGEN: Open-Set Grounded Text-to-Image Generation. 22511-22521.","journal-title":"Open-Set Grounded Text-to-Image Generation."},{"key":"e_1_3_2_2_31_1","volume-title":"LAION-SG: An Enhanced Large-Scale Dataset for Training Complex Image-Text Models with Structural Annotations. ArXiv","author":"Li Zejian","year":"2024","unstructured":"Zejian Li, Chenye Meng, Yize Li, Ling Yang, Shengyuan Zhang, Jiarui Ma, Jiayi Li, Guang Yang, Changyuan Yang, Zhiyuan Yang, Jinxiong Chang, and Lingyun Sun. 2024b. LAION-SG: An Enhanced Large-Scale Dataset for Training Complex Image-Text Models with Structural Annotations. ArXiv, Vol. abs\/2412.08580 (2024)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Zhengqi Li Richard Tucker Noah Snavely and Aleksander Holynski. 2024c. Generative Image Dynamics. 24142-24153.","DOI":"10.1109\/CVPR52733.2024.02279"},{"key":"e_1_3_2_2_33_1","volume-title":"Aesthetic Post-Training Diffusion Models from Generic Preferences with Step-by-step Preference Optimization. ArXiv","author":"Liang Zhanhao","year":"2025","unstructured":"Zhanhao Liang, Yuhui Yuan, Shuyang Gu, Bohan Chen, Tiankai Hang, Mingxi Cheng, Ji Li, and Liang Zheng. 2025. Aesthetic Post-Training Diffusion Models from Generic Preferences with Step-by-step Preference Optimization. ArXiv, Vol. abs\/2406.04314 (2025)."},{"volume-title":"R3CD: Scene Graph to Image Generation with Relation-Aware Compositional Contrastive Control Diffusion","author":"Liu Jinxiu","key":"e_1_3_2_2_34_1","unstructured":"Jinxiu Liu and Qi Liu. 2024. R3CD: Scene Graph to Image Generation with Relation-Aware Compositional Contrastive Control Diffusion, Vol. 38. 3657-3665."},{"volume-title":"Compositional Visual Generation with Composable Diffusion Models. In European Conference on Computer Vision. 423-439","author":"Liu Nan","key":"e_1_3_2_2_35_1","unstructured":"Nan Liu, Shuang Li, Yilun Du, Antonio Torralba, and Joshua B. Tenenbaum. 2022. Compositional Visual Generation with Composable Diffusion Models. In European Conference on Computer Vision. 423-439."},{"key":"e_1_3_2_2_36_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. arXiv:1711.05101 [cs.LG] https:\/\/arxiv.org\/abs\/1711.05101"},{"key":"e_1_3_2_2_37_1","unstructured":"Yunhong Lu Qichao Wang Hengyuan Cao Xierui Wang Xiaoyin Xu and Min Zhang. 2025a. InPO: Inversion Preference Optimization with Reparametrized DDIM for Efficient Diffusion Model Alignment. arXiv:2503.18454 [cs.CV] https:\/\/arxiv.org\/abs\/2503.18454"},{"key":"e_1_3_2_2_38_1","unstructured":"Yunhong Lu Qichao Wang Hengyuan Cao Xiaoyin Xu and Min Zhang. 2025b. Smoothed Preference Optimization via ReNoise Inversion for Aligning Diffusion Models with Varied Human Preferences. arXiv:2506.02698 [cs.CV] https:\/\/arxiv.org\/abs\/2506.02698"},{"key":"e_1_3_2_2_39_1","unstructured":"Nanye Ma Shangyuan Tong Haolin Jia Hexiang Hu Yu-Chuan Su Mingda Zhang Xuan Yang Yandong Li Tommi Jaakkola Xuhui Jia and Saining Xie. 2025. Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps. arXiv:2501.09732 [cs.CV] https:\/\/arxiv.org\/abs\/2501.09732"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681688"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/3618408.3619447"},{"key":"e_1_3_2_2_42_1","first-page":"27730","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F Christiano, Jan Leike, and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. In Advances in Neural Information Processing Systems, Vol. 35. Curran Associates, Inc., 27730-27744."},{"key":"e_1_3_2_2_43_1","volume-title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. ArXiv","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. ArXiv, Vol. abs\/2307.01952 (2023). https:\/\/arxiv.org\/abs\/2307.01952"},{"key":"e_1_3_2_2_44_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139). 8748-8763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_2_45_1","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems, Vol. 36 (2023), 53728-53741.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_46_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv, Vol. abs\/2204.06125 (2022). https:\/\/api.semanticscholar.org\/CorpusID:248097655"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. 10674-10685.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_48_1","first-page":"36479","article-title":"Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, Jonathan Ho, David J Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Advances in Neural Information Processing Systems, Vol. 35. 36479-36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_49_1","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, Patrick Schramowski, Srivatsa Kundurthy, Katherine Crowson, Ludwig Schmidt, Robert Kaczmarczyk, and Jenia Jitsev. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. In Advances in Neural Information Processing Systems, Vol. 35. 25278-25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_50_1","volume-title":"Adafactor: Adaptive Learning Rates with Sublinear Memory Cost. arXiv:1804.04235 [cs.LG] https:\/\/arxiv.org\/abs\/1804.04235","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer and Mitchell Stern. 2018. Adafactor: Adaptive Learning Rates with Sublinear Memory Cost. arXiv:1804.04235 [cs.LG] https:\/\/arxiv.org\/abs\/1804.04235"},{"key":"e_1_3_2_2_51_1","volume-title":"Guangyong Chen, Yijun Li, and Ying cong Chen.","author":"Shen Guibao","year":"2024","unstructured":"Guibao Shen, Luozhou Wang, Jiantao Lin, Wenhang Ge, Chaozhe Zhang, Xin Tao, Yuanhui Zhang, Pengfei Wan, Zhong ming Wang, Guangyong Chen, Yijun Li, and Ying cong Chen. 2024. SG-Adapter: Enhancing Text-to-Image Generation with Scene Graph Guidance. ArXiv, Vol. abs\/2405.15321 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270045693"},{"key":"e_1_3_2_2_52_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658157"},{"key":"e_1_3_2_2_54_1","volume-title":"Diffusion-Sharpening: Fine-tuning Diffusion Models with Denoising Trajectory Sharpening. ArXiv","author":"Tian Ye","year":"2025","unstructured":"Ye Tian, Ling Yang, Xinchen Zhang, Yunhai Tong, Mengdi Wang, and Bin Cui. 2025. Diffusion-Sharpening: Fine-tuning Diffusion Models with Denoising Trajectory Sharpening. ArXiv, Vol. abs\/2502.12146 (2025)."},{"key":"e_1_3_2_2_55_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"48918","author":"Uehara Masatoshi","year":"2024","unstructured":"Masatoshi Uehara, Yulai Zhao, Kevin Black, Ehsan Hajiramezanali, Gabriele Scalia, Nathaniel Lee Diamant, Alex M Tseng, Sergey Levine, and Tommaso Biancalani. 2024. Feedback Efficient Online Fine-Tuning of Diffusion Models. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235). PMLR, 48892-48918."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00142"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"crossref","unstructured":"Bram Wallace Meihua Dang Rafael Rafailov Linqi Zhou Aaron Lou Senthil Purushwalkam Stefano Ermon Caiming Xiong Shafiq Joty and Nikhil Naik. 2024. Diffusion Model Alignment Using Direct Preference Optimization. 8228-8238.","DOI":"10.1109\/CVPR52733.2024.00786"},{"volume-title":"Compositional Text-to-Image Synthesis with Attention Map Control of Diffusion Models","author":"Wang Ruichen","key":"e_1_3_2_2_58_1","unstructured":"Ruichen Wang, Zekang Chen, Chen Chen, Jian Ma, Haonan Lu, and Xiaodong Lin. 2024a. Compositional Text-to-Image Synthesis with Attention Map Control of Diffusion Models, Vol. 38. 5544-5552."},{"key":"e_1_3_2_2_59_1","volume-title":"Scene Graph Disentanglement and Composition for Generalizable Complex Image Generation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Wang Yunnan","year":"2024","unstructured":"Yunnan Wang, Ziqiang Li, Wenyao Zhang, Zequn Zhang, Baao Xie, Xihui Liu, Wenjun Zeng, and Xin Jin. 2024b. Scene Graph Disentanglement and Composition for Generalizable Complex Image Generation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611769"},{"volume-title":"Scene Graph to Image Synthesis via Knowledge Consensus","author":"Wu Yang","key":"e_1_3_2_2_61_1","unstructured":"Yang Wu, Pengxu Wei, and Liang Lin. 2023. Scene Graph to Image Synthesis via Knowledge Consensus, Vol. 37. 2856-2865."},{"key":"e_1_3_2_2_62_1","first-page":"7452","article-title":"BoxDiff","author":"Xie Jinheng","year":"2023","unstructured":"Jinheng Xie, Yuexiang Li, Yawen Huang, Haozhe Liu, Wentian Zhang, Yefeng Zheng, and Mike Zheng Shou. 2023. BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained Diffusion. 7452-7461.","journal-title":"Text-to-Image Synthesis with Training-Free Box-Constrained Diffusion."},{"key":"e_1_3_2_2_63_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. Article 700","author":"Xu Jiazheng","year":"2024","unstructured":"Jiazheng Xu, Xiao Liu, Yuchen Wu, Yuxuan Tong, Qinkai Li, Ming Ding, Jie Tang, and Yuxiao Dong. 2024. ImageReward: learning and evaluating human preferences for text-to-image generation. In Proceedings of the 37th International Conference on Neural Information Processing Systems. Article 700, 33 pages."},{"key":"e_1_3_2_2_64_1","unstructured":"Yilun Xu Shangyuan Tong and T. Jaakkola. 2023. Stable Target Field for Reduced Variance Score Estimation in Diffusion Models. In ArXiv."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"crossref","unstructured":"Kai Yang Jian Tao Jiafei Lyu Chunjiang Ge Jiaxin Chen Weihan Shen Xiaolong Zhu and Xiu Li. 2024b. Using Human Feedback to Fine-tune Diffusion Models without Any Reward Model. 8941-8951.","DOI":"10.1109\/CVPR52733.2024.00854"},{"key":"e_1_3_2_2_66_1","volume-title":"Diffusion-Based Scene Graph to Image Generation with Masked Contrastive Pre-Training. ArXiv","author":"Yang Ling","year":"2022","unstructured":"Ling Yang, Zhilin Huang, Yang Song, Shenda Hong, G. Li, Wentao Zhang, Bin Cui, Bernard Ghanem, and Ming-Hsuan Yang. 2022. Diffusion-Based Scene Graph to Image Generation with Masked Contrastive Pre-Training. ArXiv, Vol. abs\/2211.11138 (2022). https:\/\/api.semanticscholar.org\/CorpusID:253734954"},{"key":"e_1_3_2_2_67_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"56721","author":"Yang Ling","year":"2024","unstructured":"Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, and Bin Cui. 2024c. Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235). 56704-56721. https:\/\/proceedings.mlr.press\/v235\/yang24ai.html"},{"key":"e_1_3_2_2_68_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"56032","author":"Yang Shentao","year":"2024","unstructured":"Shentao Yang, Tianqi Chen, and Mingyuan Zhou. 2024a. A Dense Reward View on Aligning Text-to-Image Diffusion with Preference. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235). PMLR, 55998-56032."},{"key":"e_1_3_2_2_69_1","volume-title":"Training-Free Diffusion Model Alignment with Sampling Demons. In The Thirteenth International Conference on Learning Representations.","author":"Yeh Po-Hung","year":"2025","unstructured":"Po-Hung Yeh, Kuang-Huei Lee, and Jun cheng Chen. 2025. Training-Free Diffusion Model Alignment with Sampling Demons. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_2_70_1","volume-title":"RealCompo: Balancing Realism and Compositionality Improves Text-to-Image Diffusion Models. ArXiv","author":"Zhang Xinchen","year":"2024","unstructured":"Xinchen Zhang, Ling Yang, Yaqi Cai, Zhaochen Yu, Kaini Wang, Jiake Xie, Ye Tian, Minkai Xu, Yong Tang, Yujiu Yang, and Bin Cui. 2024b. RealCompo: Balancing Realism and Compositionality Improves Text-to-Image Diffusion Models. ArXiv, Vol. abs\/2402.12908 (2024)."},{"key":"e_1_3_2_2_71_1","volume-title":"IterComp: Iterative Composition-Aware Feedback Learning from Model Gallery for Text-to-Image Generation. ArXiv","author":"Zhang Xinchen","year":"2024","unstructured":"Xinchen Zhang, Ling Yang, Guohao Li, Yaqi Cai, Jiake Xie, Yong Tang, Yujiu Yang, Mengdi Wang, and Bin Cui. 2024c. IterComp: Iterative Composition-Aware Feedback Learning from Model Gallery for Text-to-Image Generation. ArXiv, Vol. abs\/2410.07171 (2024)."},{"key":"e_1_3_2_2_72_1","volume-title":"Aligning Few-Step Diffusion Models with Dense Reward Difference Learning. ArXiv","author":"Zhang Ziyi","year":"2024","unstructured":"Ziyi Zhang, Li Shen, Sen Zhang, Deheng Ye, Yong Luo, Miaojing Shi, Bo Du, and Dacheng Tao. 2024a. Aligning Few-Step Diffusion Models with Dense Reward Difference Learning. ArXiv, Vol. abs\/2411.11727 (2024)."},{"key":"e_1_3_2_2_73_1","volume-title":"Migc: Advanced multi-instance generation controller for image synthesis. ArXiv","author":"Zhou Dewei","year":"2024","unstructured":"Dewei Zhou, You Li, Fan Ma, Zongxin Yang, and Yi Yang. 2024a. Migc: Advanced multi-instance generation controller for image synthesis. ArXiv, Vol. abs\/2407.02329 (2024)."},{"key":"e_1_3_2_2_74_1","volume-title":"Migc: Multi-instance generation controller for text-to-image synthesis. 6818-6828.","author":"Zhou Dewei","year":"2024","unstructured":"Dewei Zhou, You Li, Fan Ma, Xiaoting Zhang, and Yi Yang. 2024b. Migc: Multi-instance generation controller for text-to-image synthesis. 6818-6828."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","unstructured":"Shengzhe Zhou Zejian Li Shengyuan Zhang Lefan Hou Changyuan Yang Guang Yang Zhiyuan Yang and Lingyun Sun. 2024c. Reducing spatial fitting error in distillation of denoising diffusion models. In Proceedings of the Thirty-Eighth AAAI Conference on Artificial Intelligence and Thirty-Sixth Conference on Innovative Applications of Artificial Intelligence and Fourteenth Symposium on Educational Advances in Artificial Intelligence (AAAI'24\/IAAI'24\/EAAI'24). AAAI Press Article 854 9 pages. doi:10.1609\/aaai.v38i7.28602","DOI":"10.1609\/aaai.v38i7.28602"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"crossref","unstructured":"Yufan Zhou Bingchen Liu Yizhe Zhu Xiao Yang Changyou Chen and Jinhui Xu. 2023. Shifted Diffusion for Text-to-image Generation. 10157-10166.","DOI":"10.1109\/CVPR52729.2023.00979"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755220","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:49:09Z","timestamp":1765309749000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755220"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":76,"alternative-id":["10.1145\/3746027.3755220","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755220","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}