{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T14:03:43Z","timestamp":1772719423556,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Center of Data Intelligence: Technologies, Applications, and Systems, National Taiwan University","award":["113L900901, 113L900902, 113L900903"],"award-info":[{"award-number":["113L900901, 113L900902, 113L900903"]}]},{"name":"National Science and Technology Council, Taiwan","award":["NSTC-112-2628-E-002-033-MY4, NSTC-112-2634-F-002-002-MBK, NSTC-112-2221-E-A49-059-MY3, NSTC-112-2221-E-A49-094-MY3,"],"award-info":[{"award-number":["NSTC-112-2628-E-002-033-MY4, NSTC-112-2634-F-002-002-MBK, NSTC-112-2221-E-A49-059-MY3, NSTC-112-2221-E-A49-094-MY3,"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680936","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"9465-9474","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["ReCorD: Reasoning and Correcting Diffusion for HOI Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5739-4228","authenticated-orcid":false,"given":"Jian-Yu","family":"Jiang-Lin","sequence":"first","affiliation":[{"name":"Natl. Taiwan Univ., Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1268-5214","authenticated-orcid":false,"given":"Kang-Yang","family":"Huang","sequence":"additional","affiliation":[{"name":"Natl. Taiwan Univ., Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9471-8528","authenticated-orcid":false,"given":"Ling","family":"Lo","sequence":"additional","affiliation":[{"name":"Natl. Yang Ming Chiao Tung Univ., Hsinchu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2328-2986","authenticated-orcid":false,"given":"Yi-Ning","family":"Huang","sequence":"additional","affiliation":[{"name":"Natl. Yang Ming Chiao Tung Univ., Hsinchu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7512-9624","authenticated-orcid":false,"given":"Terence","family":"Lin","sequence":"additional","affiliation":[{"name":"Natl. Yang Ming Chiao Tung Univ., Hsinchu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4071-3980","authenticated-orcid":false,"given":"Jhih-Ciang","family":"Wu","sequence":"additional","affiliation":[{"name":"Natl. Taiwan Univ., Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2216-077X","authenticated-orcid":false,"given":"Hong-Han","family":"Shuai","sequence":"additional","affiliation":[{"name":"Natl. Yang Ming Chiao Tung Univ., Hsinchu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4662-7875","authenticated-orcid":false,"given":"Wen-Huang","family":"Cheng","sequence":"additional","affiliation":[{"name":"Natl. Taiwan Univ., Taipei, Taiwan"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"A-STAR: Test-time Attention Segregation and Retention for Text-to-image Synthesis. In Int. Conf. Comput. Vis.","author":"Agarwal Aishwarya","year":"2023","unstructured":"Aishwarya Agarwal, Srikrishna Karanam, K. J. Joseph, Apoorv Saxena, Koustava Goswami, and Balaji Vasan Srinivasan. 2023. A-STAR: Test-time Attention Segregation and Retention for Text-to-image Synthesis. In Int. Conf. Comput. Vis."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01762"},{"key":"e_1_3_2_2_4_1","volume-title":"MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation. In Int. Conf. Mach. Learn.","author":"Bar-Tal Omer","year":"2023","unstructured":"Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel. 2023. MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation. In Int. Conf. Mach. Learn."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00048"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612524"},{"key":"e_1_3_2_2_9_1","volume-title":"ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models. In Int. Conf. Comput. Vis.","author":"Choi Jooyoung","year":"2021","unstructured":"Jooyoung Choi, Sungwon Kim, Yonghyun Jeong, Youngjune Gwon, and Sungroh Yoon. 2021. ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models. In Int. Conf. Comput. Vis."},{"key":"e_1_3_2_2_10_1","volume-title":"Int. Conf. Learn. Represent.","author":"Couairon Guillaume","year":"2023","unstructured":"Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord. 2023. DiffEdit: Diffusion-based semantic image editing with mask guidance. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_11_1","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_12_1","volume-title":"PixelFace: Towards Controllable Face Generation and Manipulation with Text Descriptions and Segmentation Masks. In ACM Int. Conf. Multimedia.","author":"Du Xiaoxiong","year":"2023","unstructured":"Xiaoxiong Du, Jun Peng, Yiyi Zhou, Jinlu Zhang, Siting Chen, Guannan Jiang, Xiaoshuai Sun, and Rongrong Ji. 2023. PixelFace: Towards Controllable Face Generation and Manipulation with Text Descriptions and Segmentation Masks. In ACM Int. Conf. Multimedia."},{"key":"e_1_3_2_2_13_1","volume-title":"Frido: Feature pyramid diffusion for complex scene image synthesis. In AAAI.","author":"Fan Wan-Cyuan","year":"2023","unstructured":"Wan-Cyuan Fan, Yen-Chun Chen, DongDong Chen, Yu Cheng, Lu Yuan, and Yu-Chiang Frank Wang. 2023. Frido: Feature pyramid diffusion for complex scene image synthesis. In AAAI."},{"key":"e_1_3_2_2_14_1","volume-title":"Xin Eric Wang, and William Yang Wang","author":"Feng Weixi","year":"2024","unstructured":"Weixi Feng, Wanrong Zhu, Tsu-jui Fu, Varun Jampani, Arjun Akula, Xuehai He, Sugato Basu, Xin Eric Wang, and William Yang Wang. 2024. Layoutgpt: Compositional visual planning and generation with large language models. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_15_1","volume-title":"Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Feng Yutong","year":"2024","unstructured":"Yutong Feng, Biao Gong, Di Chen, Yujun Shen, Yu Liu, and Jingren Zhou. 2024. Ranni: Taming Text-to-Image Diffusion for Accurate Instruction Following. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_16_1","volume-title":"Rectify: A Training-Free Layout Calibration System for Text-to-Image Generation. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Gong Biao","year":"2024","unstructured":"Biao Gong, Siteng Huang, Yutong Feng, Shiwei Zhang, Yuyuan Li, and Yu Liu. 2024. Check, Locate, Rectify: A Training-Free Layout Calibration System for Text-to-Image Generation. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_17_1","volume-title":"Visual semantic role labeling. arXiv preprint arXiv:1505.04474","author":"Gupta Saurabh","year":"2015","unstructured":"Saurabh Gupta and Jitendra Malik. 2015. Visual semantic role labeling. arXiv preprint arXiv:1505.04474 (2015)."},{"key":"e_1_3_2_2_18_1","volume-title":"Prompt-to-Prompt Image Editing with Cross-Attention Control. In Int. Conf. Learn. Represent.","author":"Hertz Amir","year":"2023","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2023. Prompt-to-Prompt Image Editing with Cross-Attention Control. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_19_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. In EMNLP."},{"key":"e_1_3_2_2_20_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_21_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_22_1","volume-title":"InteractDiffusion: Interaction Control in Text-to-Image Diffusion Models. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Hoe Jiun Tian","year":"2024","unstructured":"Jiun Tian Hoe, Xudong Jiang, Chee Seng Chan, Yap-Peng Tan, and Weipeng Hu. 2024. InteractDiffusion: Interaction Control in Text-to-Image Diffusion Models. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_23_1","unstructured":"Kaiyi Huang Kaiyue Sun Enze Xie Zhenguo Li and Xihui Liu. 2024. T2i-compbench: A comprehensive benchmark for open-world compositional text-to-image generation. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_24_1","volume-title":"Composer: Creative and Controllable Image Synthesis with Composable Conditions. In Int. Conf. Mach. Learn.","author":"Huang Lianghua","year":"2023","unstructured":"Lianghua Huang, Di Chen, Yu Liu, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. Composer: Creative and Controllable Image Synthesis with Composable Conditions. In Int. Conf. Mach. Learn."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548282"},{"key":"e_1_3_2_2_26_1","volume-title":"SmartEdit: Exploring Complex Instruction-based Image Editing with Multimodal Large Language Models. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Huang Yuzhou","year":"2024","unstructured":"Yuzhou Huang, Liangbin Xie, Xintao Wang, Ziyang Yuan, Xiaodong Cun, Yixiao Ge, Jiantao Zhou, Chao Dong, Rui Huang, Ruimao Zhang, et al. 2024. SmartEdit: Exploring Complex Instruction-based Image Editing with Multimodal Large Language Models. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"e_1_3_2_2_28_1","volume-title":"Pick-a-pic: An open dataset of user preferences for text-to-image generation. In Adv. Neural Inform. Process. Syst.","author":"Kirstain Yuval","year":"2024","unstructured":"Yuval Kirstain, Adam Polyak, Uriel Singer, Shahbuland Matiana, Joe Penna, and Omer Levy. 2024. Pick-a-pic: An open dataset of user preferences for text-to-image generation. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_29_1","volume-title":"Int. Conf. Mach. Learn.","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In Int. Conf. Mach. Learn."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_2_31_1","article-title":"LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models. In","author":"Lian Long","year":"2024","unstructured":"Long Lian, Boyi Li, Adam Yala, and Trevor Darrell. 2024. LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models. In Trans. Mach. Learn Res.","journal-title":"Trans. Mach. Learn Res."},{"key":"e_1_3_2_2_32_1","volume-title":"Videodirectorgpt: Consistent multi-scene video generation via llm-guided planning. arXiv preprint arXiv:2309.15091","author":"Lin Han","year":"2023","unstructured":"Han Lin, Abhay Zala, Jaemin Cho, and Mohit Bansal. 2023. Videodirectorgpt: Consistent multi-scene video generation via llm-guided planning. arXiv preprint arXiv:2309.15091 (2023)."},{"key":"e_1_3_2_2_33_1","volume-title":"Microsoft COCO: Common Objects in Context. In Eur. Conf. Comput. Vis.","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Eur. Conf. Comput. Vis."},{"key":"e_1_3_2_2_34_1","volume-title":"Towards Understanding Cross and Self-Attention in Stable Diffusion for Text-Guided Image Editing. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Liu Bingyan","year":"2024","unstructured":"Bingyan Liu, Chengyu Wang, Tingfeng Cao, Kui Jia, and Jun Huang. 2024. Towards Understanding Cross and Self-Attention in Stable Diffusion for Text-Guided Image Editing. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_35_1","volume-title":"Int. Conf. Learn. Represent.","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Kevin Lin, Linjie Li, Jianfeng Wang, Yaser Yacoob, and Lijuan Wang. 2023. Mitigating hallucination in large multi-modal models via robust instruction tuning. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02309"},{"key":"e_1_3_2_2_37_1","volume-title":"Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Lu Yanzuo","year":"2024","unstructured":"Yanzuo Lu, Manlin Zhang, Andy J Ma, Xiaohua Xie, and Jian-Huang Lai. 2024. Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_38_1","volume-title":"SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations. In Int. Conf. Learn. Represent.","author":"Meng Chenlin","year":"2021","unstructured":"Chenlin Meng, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2021. SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Sewon Min Xinxi Lyu Ari Holtzman Mikel Artetxe Mike Lewis Hannaneh Hajishirzi and Luke Zettlemoyer. 2022. Rethinking the Role of Demonstrations: What makes In-context Learning Work?. In EMNLP.","DOI":"10.18653\/v1\/2022.emnlp-main.759"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01428"},{"key":"e_1_3_2_2_41_1","volume-title":"T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453","author":"Mou Chong","year":"2023","unstructured":"Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)."},{"key":"e_1_3_2_2_42_1","volume-title":"DreamMatcher: Appearance Matching Self-Attention for Semantically-Consistent Text-to-Image Personalization. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Nam Jisu","year":"2024","unstructured":"Jisu Nam, Heesu Kim, DongJae Lee, Siyoon Jin, Seungryong Kim, and Seunggyu Chang. 2024. DreamMatcher: Appearance Matching Self-Attention for Semantically-Consistent Text-to-Image Personalization. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.1979.4310076"},{"key":"e_1_3_2_2_44_1","volume-title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. In Int. Conf. Learn. Represent.","author":"Podell Dustin","year":"2024","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2024. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612012"},{"key":"e_1_3_2_2_46_1","volume-title":"Int. Conf. Mach. Learn.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Int. Conf. Mach. Learn."},{"key":"e_1_3_2_2_47_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_2_48_1","volume-title":"Int. Conf. Mach. Learn.","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In Int. Conf. Mach. Learn."},{"key":"e_1_3_2_2_49_1","unstructured":"Royi Rassin Eran Hirsch Daniel Glickman Shauli Ravfogel Yoav Goldberg and Gal Chechik. 2023. Linguistic Binding in Diffusion Models: Enhancing Attribute Correspondence through Attention Map Alignment. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_51_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_52_1","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_53_1","volume-title":"A picture is worth a thousand words: Principled recaptioning improves image generation. arXiv preprint arXiv:2310.16656","author":"Segalis Eyal","year":"2023","unstructured":"Eyal Segalis, Dani Valevski, Danny Lumen, Yossi Matias, and Yaniv Leviathan. 2023. A picture is worth a thousand words: Principled recaptioning improves image generation. arXiv preprint arXiv:2310.16656 (2023)."},{"key":"e_1_3_2_2_54_1","volume-title":"Denoising Diffusion Implicit Models. In Int. Conf. Learn. Represent.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613806"},{"key":"e_1_3_2_2_56_1","volume-title":"AnyText: Multilingual Visual Text Generation and Editing. In Int. Conf. Learn. Represent.","author":"Tuo Yuxiang","year":"2024","unstructured":"Yuxiang Tuo, Wangmeng Xiang, Jun-Yan He, Yifeng Geng, and Xuansong Xie. 2024. AnyText: Multilingual Visual Text Generation and Editing. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_57_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_58_1","volume-title":"Sketch-guided text-to-image diffusion models","author":"Voynov Andrey","unstructured":"Andrey Voynov, Kfir Aberman, and Daniel Cohen-Or. 2023. Sketch-guided text-to-image diffusion models. In ACM Special Interest Group on Computer Graphics."},{"key":"e_1_3_2_2_59_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00605"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00685"},{"key":"e_1_3_2_2_62_1","volume-title":"Yefeng Zheng, Linlin Shen, and Mike Zheng Shou.","author":"Xie Jinheng","year":"2023","unstructured":"Jinheng Xie, Kai Ye, Yudong Li, Yuexiang Li, Kevin Qinghong Lin, Yefeng Zheng, Linlin Shen, and Mike Zheng Shou. 2023. VisorGPT: Learning Visual Prior via Generative Pre-Training. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_63_1","unstructured":"Ling Yang Jingwei Liu Shenda Hong Zhilong Zhang Zhilin Huang Zheming Cai Wentao Zhang and Bin Cui. 2024. Improving diffusion-based image synthesis with context prediction. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612200"},{"key":"e_1_3_2_2_65_1","volume-title":"The Fabrication of Reality and Fantasy: Scene Generation with LLM-Assisted Prompt Interpretation. In Eur. Conf. Comput. Vis.","author":"Yao Yi","year":"2024","unstructured":"Yi Yao, Chan-Feng Hsu, Jhe-Hao Lin, Hongxia Xie, Terence Lin, Yi-Ning Huang, Hong-Han Shuai, and Wen-Huang Cheng. 2024. The Fabrication of Reality and Fantasy: Scene Generation with LLM-Assisted Prompt Interpretation. In Eur. Conf. Comput. Vis."},{"key":"e_1_3_2_2_66_1","volume-title":"Int. Conf. Learn. Represent.","author":"Yuksekgonul Mert","year":"2023","unstructured":"Mert Yuksekgonul, Federico Bianchi, Pratyusha Kalluri, Dan Jurafsky, and James Zou. 2023. When and why vision-language models behave like bags-of-words, and what to do about it?. In Int. Conf. Learn. Represent."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_2_68_1","volume-title":"Controllable Text-to-Image Generation with GPT-4. arXiv preprint arXiv:2305.18583","author":"Zhang Tianjun","year":"2023","unstructured":"Tianjun Zhang, Yi Zhang, Vibhav Vineet, Neel Joshi, and Xin Wang. 2023. Controllable Text-to-Image Generation with GPT-4. arXiv preprint arXiv:2305.18583 (2023)."},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01894"},{"key":"e_1_3_2_2_70_1","volume-title":"Movq: Modulating quantized vectors for high-fidelity image generation. In Adv. Neural Inform. Process. Syst.","author":"Zheng Chuanxia","year":"2022","unstructured":"Chuanxia Zheng, Tung-Long Vuong, Jianfei Cai, and Dinh Phung. 2022. Movq: Modulating quantized vectors for high-fidelity image generation. In Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_2_71_1","volume-title":"LayoutDiffusion: Controllable Diffusion Model for Layout-to-image Generation. In IEEE Conf. Comput. Vis. Pattern Recog.","author":"Zheng Guangcong","year":"2023","unstructured":"Guangcong Zheng, Xianpan Zhou, Xuewei Li, Zhongang Qi, Ying Shan, and Xi Li. 2023. LayoutDiffusion: Controllable Diffusion Model for Layout-to-image Generation. In IEEE Conf. Comput. Vis. Pattern Recog."},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548298"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680936","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680936","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:34Z","timestamp":1750295854000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680936"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":72,"alternative-id":["10.1145\/3664647.3680936","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680936","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}