{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T07:14:47Z","timestamp":1781334887061,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3713801","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T03:20:47Z","timestamp":1745464847000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["SketchFlex: Facilitating Spatial-Semantic Coherence in Text-to-Image Generation with Region-Based Sketches"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9564-3632","authenticated-orcid":false,"given":"Haichuan","family":"Lin","sequence":"first","affiliation":[{"name":"Thrust of Computational Media and Arts, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8874-5928","authenticated-orcid":false,"given":"Yilin","family":"Ye","sequence":"additional","affiliation":[{"name":"Thrust of Computational Media and Arts, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China and Academy of Interdisciplinary Studies, The Hong Kong University of Science and Technology, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4629-6268","authenticated-orcid":false,"given":"Jiazhi","family":"Xia","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Central South University, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5600-8824","authenticated-orcid":false,"given":"Wei","family":"Zeng","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China and The Hong Kong University of Science and Technology, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2005. Krita. https:\/\/krita.org\/."},{"key":"e_1_3_3_2_3_2","unstructured":"2022. Civitai. https:https:\/\/civitai.com. https:https:\/\/civitai.com"},{"key":"e_1_3_3_2_4_2","unstructured":"2023. Adobe Firefly. https:\/\/www.adobe.com\/uk\/products\/firefly.html."},{"key":"e_1_3_3_2_5_2","unstructured":"2023. colorful-realistic-xl-v1-sdxl. https:\/\/huggingface.co\/John6666\/colorful-realistic-xl-v1-sdxl."},{"key":"e_1_3_3_2_6_2","unstructured":"2023. Midjourney. https:\/\/www.midjourney.com\/."},{"key":"e_1_3_3_2_7_2","unstructured":"2023. Stable Diffusion. https:\/\/github.com\/AUTOMATIC1111\/stable-diffusion-webui."},{"key":"e_1_3_3_2_8_2","unstructured":"2024. InternVL2-Llama3-76B. https:\/\/huggingface.co\/OpenGVLab\/InternVL2-Llama3-76B\/."},{"key":"e_1_3_3_2_9_2","unstructured":"2024. Qwen2-VL. https:\/\/github.com\/QwenLM\/Qwen2-VL\/."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3092919.3092926"},{"key":"e_1_3_3_2_11_2","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Bar-Tal Omer","year":"2023","unstructured":"Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel. 2023. MultiDiffusion: fusing diffusion paths for controlled image generation. In Proceedings of the International Conference on Machine Learning. Article 74, 16\u00a0pages."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3641889"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606725"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00132"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"J Canny. 1986. A Computational Approach to Edge Detection. IEEE Trans. Pattern Anal. Mach. Intell. (1986) 679\u2013698. 10.1109\/TPAMI.1986.4767851","DOI":"10.1109\/TPAMI.1986.4767851"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-industry.1"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3596001"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606777"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501819"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/2757226.2764773"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"Yuki Endo. 2023. Masked-attention diffusion guidance for spatially controlling text-to-image generation. The Visual Computer (2023) 6033\u20136045. 10.1007\/s00371-023-03151-y","DOI":"10.1007\/s00371-023-03151-y"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"Yingchaojie Feng Xingbo Wang Kam\u00a0Kwai Wong Sijia Wang Yuhong Lu Minfeng Zhu Baicheng Wang and Wei Chen. 2023. PromptMagician: Interactive Prompt Engineering for Text-to-Image Creation. IEEE Trans. Vis. Comput. Graph. (2023) 295\u2013305. 10.1109\/TVCG.2023.3327168","DOI":"10.1109\/TVCG.2023.3327168"},{"key":"e_1_3_3_2_24_2","unstructured":"Qingyan Guo Rui Wang Junliang Guo Bei Li Kaitao Song Xu Tan Guoqing Liu Jiang Bian and Yujiu Yang. 2023. Connecting large language models with evolutionary algorithms yields powerful prompt optimizers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.08532 (2023)."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Yuhan Guo Hanning Shao Can Liu Kai Xu and Xiaoru Yuan. 2024. PrompTHis: Visualizing the Process and Influence of Prompt Editing during Text-to-Image Creation. IEEE Trans. Vis. Comput. Graph. (2024) 1\u201312.","DOI":"10.1109\/TVCG.2024.3408255"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3669045"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642824"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","unstructured":"James Hutson and Peter Cotroneo. 2023. Generative AI tools in art education: Exploring prompt engineering and iterative processes for enhanced creativity. Metaverse 4 1 (2023) 1\u201314. 10.54517\/m.v4i1.2164","DOI":"10.54517\/m.v4i1.2164"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600211.3604681"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander\u00a0C. Berg Wan-Yen Lo Piotr Doll\u00e1r and Ross Girshick. 2023. Segment Anything. arXiv:https:\/\/arXiv.org\/abs\/2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584078"},{"key":"e_1_3_3_2_34_2","unstructured":"Akio Kodaira Chenfeng Xu Toshiki Hazama Takanori Yoshimoto Kohei Ohno Shogo Mitsuhori Soichi Sugano Hanying Cho Zhijian Liu and Kurt Keutzer. 2023. Streamdiffusion: A pipeline-level solution for real-time interactive generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.12491 (2023)."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00662"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David\u00a0A. Shamma Michael\u00a0S. Bernstein and Li Fei-Fei. 2017. Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. Int. J. Comput. Vision 123 1 (2017) 32\u201373. 10.1007\/s11263-016-0981-7","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_3_2_37_2","unstructured":"Katrin Lasinger Ren\u00e9 Ranftl Konrad Schindler and Vladlen Koltun. 2019. Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.01341 (2019)."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3595977"},{"key":"e_1_3_3_2_39_2","unstructured":"Jaerin Lee Daniel\u00a0Sungho Jung Kanggeon Lee and Kyoung\u00a0Mu Lee. 2024. StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based Semantic Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.09055 (2024)."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","unstructured":"Zejian Li Ying Zhang Shengzhe Zhou Qi Liu Jiesi Zhang Haoran Xu Shuyao Chen Xiaoyu Chen and Lingyun Sun. 2024. RealtimeGen: An Intervenable AI Image Generation System for Commercial Digital Art Asset Creators. International Journal of Human\u2013Computer Interaction (2024) 1\u201324. 10.1080\/10447318.2024.2382508","DOI":"10.1080\/10447318.2024.2382508"},{"key":"e_1_3_3_2_42_2","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et\u00a0al. 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.05499 (2023)."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501825"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3596098"},{"key":"e_1_3_3_2_45_2","unstructured":"Simian Luo Yiqin Tan Longbo Huang Jian Li and Hang Zhao. 2023. Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.04378 (2023)."},{"key":"e_1_3_3_2_46_2","unstructured":"Simian Luo Yiqin Tan Suraj Patil Daniel Gu Patrick von Platen Apolin\u00e1rio Passos Longbo Huang Jian Li and Hang Zhao. 2023. Lcm-lora: A universal stable-diffusion acceleration module. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.05556 (2023)."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Chong Mou Xintao Wang Liangbin Xie Yanze Wu Jian Zhang Zhongang Qi Ying Shan and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.08453 (2023).","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Yasumasa Onoe Sunayana Rane Zachary Berger Yonatan Bitton Jaemin Cho Roopal Garg Alexander Ku Zarana Parekh Jordi Pont-Tuset Garrett Tanzer et\u00a0al. 2024. DOCCI: Descriptions of Connected and Contrasting Images. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.19753 (2024).","DOI":"10.1007\/978-3-031-73027-6_17"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","unstructured":"Jonas Oppenlaender. 2023. A taxonomy of prompt modifiers for text-to-image generation. Behaviour & Information Technology (2023) 1\u201314. 10.1080\/0144929X.2023.2286532","DOI":"10.1080\/0144929X.2023.2286532"},{"key":"e_1_3_3_2_50_2","unstructured":"Jonas Oppenlaender Rhema Linder and Johanna Silvennoinen. 2023. Prompting ai art: An investigation into the creative skill of prompt engineering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.13534 (2023)."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01282"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"crossref","unstructured":"Reid Pryzant Dan Iter Jerry Li Yin\u00a0Tat Lee Chenguang Zhu and Michael Zeng. 2023. Automatic prompt optimization with gradient descent and beam search. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.03495 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.494"},{"key":"e_1_3_3_2_53_2","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 (2022)."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.5555\/3600270.3602913"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","unstructured":"Yang Shi Tian Gao Xiaohan Jiao and Nan Cao. 2023. Understanding design collaboration between designers and artificial intelligence: a systematic literature review. Proceedings of the ACM on Human-Computer Interaction 7 Article 368 (2023) 35\u00a0pages. 10.1145\/3610217","DOI":"10.1145\/3610217"},{"key":"e_1_3_3_2_57_2","unstructured":"Omost Team. 2024. Omost GitHub Page. https:\/\/github.com\/lllyasviel\/Omost."},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","unstructured":"Veera Vimpari Annakaisa Kultima Perttu H\u00e4m\u00e4l\u00e4inen and Christian Guckelsberger. 2023. \u201cAn Adapt-or-Die Type of Situation\u201d: Perception Adoption and Use of Text-to-Image-Generation AI by Game Industry Professionals. Proc. ACM Hum.-Comput. Interact. 7 CHI PLAY Article 379 (2023) 34\u00a0pages. 10.1145\/3611025","DOI":"10.1145\/3611025"},{"key":"e_1_3_3_2_59_2","volume-title":"International Conference on Machine Learning","author":"Wang Ruochen","year":"2024","unstructured":"Ruochen Wang, Ting Liu, Cho-Jui Hsieh, and Boqing Gong. 2024. On Discrete Prompt Optimization for Diffusion Models. In International Conference on Machine Learning."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00596"},{"key":"e_1_3_3_2_61_2","unstructured":"Xinyi Wang Wanrong Zhu and William\u00a0Yang Wang. 2023. Large language models are implicitly topic models: Explaining and finding good demonstrations for in-context learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.11916 (2023)."},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642803"},{"key":"e_1_3_3_2_63_2","unstructured":"Zijie\u00a0J Wang Evan Montoya David Munechika Haoyang Yang Benjamin Hoover and Duen\u00a0Horng Chau. 2022. Diffusiondb: A large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.14896 (2022)."},{"key":"e_1_3_3_2_64_2","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems","author":"Wei Jason","year":"2024","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed\u00a0H. Chi, Quoc\u00a0V. Le, and Denny Zhou. 2024. Chain-of-thought prompting elicits reasoning in large language models. In Proceedings of the International Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","unstructured":"Shishi Xiao Suizi Huang Yue Lin Yilin Ye and Wei Zeng. 2023. Let the chart spark: Embedding semantic context into chart with text-to-image generative model. IEEE Trans. Vis. Comput. Graph. (2023). 10.1109\/TVCG.2023.3326913","DOI":"10.1109\/TVCG.2023.3326913"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00685"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3587430"},{"key":"e_1_3_3_2_68_2","volume-title":"International Conference on Machine Learning","author":"Yang Ling","year":"2024","unstructured":"Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, and Bin Cui. 2024. Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs. In International Conference on Machine Learning."},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642165"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_2_71_2","unstructured":"Xu Zhao Wenchao Ding Yongqi An Yinglong Du Tao Yu Min Li Ming Tang and Jinqiao Wang. 2023. Fast segment anything. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.12156 (2023)."},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02154"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713801","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3713801","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T04:47:23Z","timestamp":1751604443000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713801"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":72,"alternative-id":["10.1145\/3706598.3713801","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3713801","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}