{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:50:17Z","timestamp":1778082617259,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730613","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["InstanceGen: Image Generation with Instance-level Instructions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0079-0046","authenticated-orcid":false,"given":"Etai","family":"Sella","sequence":"first","affiliation":[{"name":"Tel Aviv University, Tel Aviv, Israel and Meta, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6004-1299","authenticated-orcid":false,"given":"Yanir","family":"Kleiman","sequence":"additional","affiliation":[{"name":"Meta, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3476-0940","authenticated-orcid":false,"given":"Hadar","family":"Averbuch-Elor","sequence":"additional","affiliation":[{"name":"Cornell Tech, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01834"},{"key":"e_1_3_3_2_3_1","unstructured":"Omer Bar-Tal Lior Yariv Yaron Lipman and Tali Dekel. 2023. Multidiffusion: Fusing diffusion paths for controlled image generation. (2023)."},{"key":"e_1_3_3_2_4_1","unstructured":"Lital Binyamin Yoad Tewel Hilit Segev Eran Hirsch Royi Rassin and Gal Chechik. 2024. Make It Count: Text-to-Image Generation with an Accurate Number of Objects. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.10210 (2024)."},{"key":"e_1_3_3_2_5_1","unstructured":"Kevin Black Michael Janner Yilun Du Ilya Kostrikov and Sergey Levine. 2023. Training diffusion models with reinforcement learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13301 (2023)."},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Yuval Alaluf Yael Vinker Lior Wolf and Daniel Cohen-Or. 2023. Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201310.","DOI":"10.1145\/3592116"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"e_1_3_3_2_8_1","unstructured":"Xiaohui Chen Yongfei Liu Yingxiang Yang Jianbo Yuan Quanzeng You Li-Ping Liu and Hongxia Yang. 2023. Reason out your layout: Evoking the layout master from large language models for text-to-image synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17126 (2023)."},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72630-9_25"},{"key":"e_1_3_3_2_10_1","unstructured":"Xiaoliang Dai Ji Hou Chih-Yao Ma Sam Tsai Jialiang Wang Rui Wang Peizhao Zhang Simon Vandenhende Xiaofang Wang Abhimanyu Dubey Matthew Yu Abhishek Kadian Filip Radenovic Dhruv Mahajan Kunpeng Li Yue Zhao Vladan Petrovic Mitesh\u00a0Kumar Singh Simran Motwani Yi Wen Yiwen Song Roshan Sumbaly Vignesh Ramanathan Zijian He Peter Vajda and Devi Parikh. 2023. Emu: Enhancing Image Generation Models Using Photogenic Needles in a Haystack. arxiv:https:\/\/arXiv.org\/abs\/2309.15807\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2309.15807"},{"key":"e_1_3_3_2_11_1","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021) 8780\u20138794."},{"key":"e_1_3_3_2_12_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et\u00a0al. 2024. The llama 3 herd of models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_2_13_1","volume-title":"Forty-first International Conference on Machine Learning","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et\u00a0al. 2024. Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"crossref","unstructured":"Dhruba Ghosh Hannaneh Hajishirzi and Ludwig Schmidt. 2024. Geneval: An object-focused framework for evaluating text-to-image alignment. Advances in Neural Information Processing Systems 36 (2024).","DOI":"10.52202\/075280-2270"},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_3_2_16_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_2_18_1","unstructured":"Black\u00a0Forest Labs. 2023. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72673-6_20"},{"key":"e_1_3_3_2_21_1","unstructured":"Yaron Lipman Ricky\u00a0TQ Chen Heli Ben-Hamu Maximilian Nickel and Matt Le. 2022. Flow matching for generative modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02747 (2022)."},{"key":"e_1_3_3_2_22_1","unstructured":"Koichi Namekata Amirmojtaba Sabour Sanja Fidler and Seung\u00a0Wook Kim. 2024. Emerdiff: Emerging pixel-level semantic knowledge in diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.11739 (2024)."},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02107"},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00758"},{"key":"e_1_3_3_2_25_1","unstructured":"Dustin Podell Zion English Kyle Lacey Andreas Blattmann Tim Dockhorn Jonas M\u00fcller Joe Penna and Robin Rombach. 2023. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. arxiv:https:\/\/arXiv.org\/abs\/2307.01952\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2307.01952"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"crossref","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L Denton Kamyar Ghasemipour Raphael Gontijo\u00a0Lopes Burcu Karagol\u00a0Ayan Tim Salimans et\u00a0al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems 35 (2022) 36479\u201336494.","DOI":"10.52202\/068431-2643"},{"key":"e_1_3_3_2_28_1","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00786"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00596"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00605"},{"key":"e_1_3_3_2_32_1","unstructured":"Xindi Wu Dingli Yu Yangsibo Huang Olga Russakovsky and Sanjeev Arora. 2024b. Conceptmix: A compositional image generation benchmark with controllable difficulty. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.14339 (2024)."},{"key":"e_1_3_3_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00685"},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730613","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:58:35Z","timestamp":1774018715000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730613"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":33,"alternative-id":["10.1145\/3721238.3730613","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730613","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}