{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:50:46Z","timestamp":1765309846985,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755241","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"9940-9949","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PLATO: Generating Objects from Part Lists via Synthesized Layouts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4234-9780","authenticated-orcid":false,"given":"Amruta","family":"Muthal","sequence":"first","affiliation":[{"name":"CVIT, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1327-0036","authenticated-orcid":false,"given":"Varghese P","family":"Kuruvilla","sequence":"additional","affiliation":[{"name":"CVIT, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4134-1154","authenticated-orcid":false,"given":"Ravi Kiran","family":"Sarvadevabhatla","sequence":"additional","affiliation":[{"name":"CVIT, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"volume-title":"Proceedings of the 29th ACM International Conference on Multimedia. 318-326","author":"Baghel Rishabh","key":"e_1_3_2_1_1_1","unstructured":"Rishabh Baghel, Abhishek Trivedi, and Ravi Kiran Ravichandran, Tejas a nd Sarvadevabhatla. 2021. Meronymnet: A hierarchical model for unified and controllable multi-category object generation. In Proceedings of the 29th ACM International Conference on Multimedia. 318-326."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Xianjie Chen Roozbeh Mottaghi Xiaobai Liu Sanja Fidler Raquel Urtasun and Alan Yuille. 2014. Detect What You Can: Detecting and Representing Objects using Holistic Models and Body Parts. arXiv:1406.2031 [cs.CV] https:\/\/arxiv.org\/abs\/1406.2031","DOI":"10.1109\/CVPR.2014.254"},{"key":"e_1_3_2_1_3_1","unstructured":"Patrick Esser Sumith Kulal Andreas Blattmann Rahim Entezari Jonas M\u00fcller Harry Saini Yam Levi Dominik Lorenz Axel Sauer Frederic Boesel Dustin Podell Tim Dockhorn Zion English Kyle Lacey Alex Goodwin Yannik Marek and Robin Rombach. 2024. Scaling Rectified Flow Transformers for High-Resolution Image Synthesis. arXiv:2403.03206 [cs.CV] https:\/\/arxiv.org\/abs\/2403.03206"},{"key":"e_1_3_2_1_4_1","volume-title":"Varun Jampani, Arjun Akula, Xuehai He, Sugato Basu, Xin Eric Wang, and William Yang Wang.","author":"Feng Weixi","year":"2023","unstructured":"Weixi Feng, Wanrong Zhu, Tsu jui Fu, Varun Jampani, Arjun Akula, Xuehai He, Sugato Basu, Xin Eric Wang, and William Yang Wang. 2023. LayoutGPT: Compositional Visual Planning and Generation with Large Language Models. arXiv:2305.15393 [cs.CV] https:\/\/arxiv.org\/abs\/2305.15393"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00671"},{"key":"e_1_3_2_1_6_1","volume-title":"OLAF: A Plug-and-Play Framework for Enhanced Multi-object Multi-part Scene Parsing. arXiv:2411.02858 [cs.CV] https:\/\/arxiv.org\/abs\/2411.02858","author":"Gupta Pranav","year":"2024","unstructured":"Pranav Gupta, Rishubh Singh, Pradeep Shenoy, and Ravikiran Sarvadevabhatla. 2024. OLAF: A Plug-and-Play Framework for Enhanced Multi-object Multi-part Scene Parsing. arXiv:2411.02858 [cs.CV] https:\/\/arxiv.org\/abs\/2411.02858"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Ju He Shuo Yang Shaokang Yang Adam Kortylewski Xiaoding Yuan Jie-Neng Chen Shuai Liu Cheng Yang Qihang Yu and Alan Yuille. 2022. PartImageNet: A Large High-Quality Dataset of Parts. arXiv:2112.00933 [cs.CV] https:\/\/arxiv.org\/abs\/2112.00933","DOI":"10.1007\/978-3-031-20074-8_8"},{"key":"e_1_3_2_1_8_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2022","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2022. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. arXiv:2104.08718 [cs.CV] https:\/\/arxiv.org\/abs\/2104.08718"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00015"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00193"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00980"},{"key":"e_1_3_2_1_12_1","volume-title":"Jian-Guang Lou, and Dongmei Zhang.","author":"Jiang Zhaoyun","year":"2023","unstructured":"Zhaoyun Jiang, Jiaqi Guo, Shizhao Sun, Huayu Deng, Zhongkai Wu, Vuksan Mijovic, Zijiang James Yang, Jian-Guang Lou, and Dongmei Zhang. 2023. LayoutFormer: Conditional Graphic Layout Generation via Constraint Serialization and Decoding Space Restriction. arXiv:2208.08037 [cs.CV] https:\/\/arxiv.org\/abs\/2208.08037"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19994"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475497"},{"key":"e_1_3_2_1_15_1","volume-title":"Kipf and Max Welling","author":"Thomas","year":"2016","unstructured":"Thomas N. Kipf and Max Welling. 2016. Variational Graph Auto-Encoders. arXiv:1611.07308 [stat.ML] https:\/\/arxiv.org\/abs\/1611.07308"},{"key":"e_1_3_2_1_16_1","volume-title":"Kipf and Max Welling","author":"Thomas","year":"2017","unstructured":"Thomas N. Kipf and Max Welling. 2017. Semi-Supervised Classification with Graph Convolutional Networks. arXiv:1609.02907 [cs.LG] https:\/\/arxiv.org\/abs\/1609.02907"},{"key":"e_1_3_2_1_17_1","volume-title":"BLT: Bidirectional Layout Transformer for Controllable Layout Generation. arXiv:2112.05112 [cs.CV] https:\/\/arxiv.org\/abs\/2112.05112","author":"Kong Xiang","year":"2022","unstructured":"Xiang Kong, Lu Jiang, Huiwen Chang, Han Zhang, Yuan Hao, Haifeng Gong, and Irfan Essa. 2022. BLT: Bidirectional Layout Transformer for Controllable Layout Generation. arXiv:2112.05112 [cs.CV] https:\/\/arxiv.org\/abs\/2112.05112"},{"key":"e_1_3_2_1_18_1","unstructured":"Black Forest Labs. 2024. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_2_1_19_1","unstructured":"Hsin-Ying Lee Lu Jiang Irfan Essa Phuong B Le Haifeng Gong Ming-Hsuan Yang and Weilong Yang. 2020. Neural Design Network: Graphic Layout Generation with Constraints. arXiv:1912.09421 [cs.CV] https:\/\/arxiv.org\/abs\/1912.09421"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_1_21_1","volume-title":"LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models. arXiv preprint arXiv:2305.13655","author":"Lian Long","year":"2023","unstructured":"Long Lian, Boyi Li, Adam Yala, and Trevor Darrell. 2023. LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models. arXiv preprint arXiv:2305.13655 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"LayoutPrompter: Awaken the Design Ability of Large Language Models. In Thirty-seventh Conference on Neural Information Processing Systems.","author":"Lin Jiawei","year":"2023","unstructured":"Jiawei Lin, Jiaqi Guo, Shizhao Sun, Zijiang James Yang, Jian-Guang Lou, and Dongmei Zhang. 2023. LayoutPrompter: Awaken the Design Ability of Large Language Models. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_26"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"e_1_3_2_1_25_1","unstructured":"Kam Woh Ng Xiatian Zhu Yi-Zhe Song and Tao Xiang. 2024. PartCraft: Crafting Creative Objects by Parts. In ECCV."},{"key":"e_1_3_2_1_26_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. arXiv:2112.10741 [cs.CV] https:\/\/arxiv.org\/abs\/2112.10741","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. arXiv:2112.10741 [cs.CV] https:\/\/arxiv.org\/abs\/2112.10741"},{"key":"e_1_3_2_1_27_1","volume-title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. arXiv:2307.01952 [cs.CV] https:\/\/arxiv.org\/abs\/2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. arXiv:2307.01952 [cs.CV] https:\/\/arxiv.org\/abs\/2307.01952"},{"key":"e_1_3_2_1_28_1","unstructured":"Leigang Qu Shengqiong Wu Hao Fei Liqiang Nie and Tat-Seng Chua. 2023. LayoutLLM-T2I: Eliciting Layout Guidance from LLM for Text-to-Image Generation. arXiv:2308.05095 [cs.CV] https:\/\/arxiv.org\/abs\/2308.05095"},{"key":"e_1_3_2_1_29_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_30_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv:2204.06125 [cs.CV] https:\/\/arxiv.org\/abs\/2204.06125"},{"key":"e_1_3_2_1_31_1","unstructured":"Aditya Ramesh Mikhail Pavlov Gabriel Goh Scott Gray Chelsea Voss Alec Radford Mark Chen and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. arXiv:2102.12092 [cs.CV] https:\/\/arxiv.org\/abs\/2102.12092"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian Reid and Silvio Savarese. 2019. Generalized Intersection over Union: A Metric and A Loss for Bounding Box Regression. arXiv:1902.09630 [cs.CV] https:\/\/arxiv.org\/abs\/1902.09630","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_34_1","volume-title":"Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, and Mohammad Norouzi.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. arXiv:2205.11487 [cs.CV] https:\/\/arxiv.org\/abs\/2205.11487"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Jaejung Seol Seojun Kim and Jaejun Yoo. 2024. PosterLlama: Bridging Design Ability of Langauge Model to Contents-Aware Layout Generation. arXiv:2404.00995 [cs.CV] https:\/\/arxiv.org\/abs\/2404.00995","DOI":"10.1007\/978-3-031-73007-8_26"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00621"},{"key":"e_1_3_2_1_37_1","volume-title":"Fourier features let networks learn high frequency functions in low dimensional domains. Advances in neural information processing systems","author":"Tancik Matthew","year":"2020","unstructured":"Matthew Tancik, Pratul Srinivasan, Ben Mildenhall, Sara Fridovich-Keil, Nithin Raghavan, Utkarsh Singhal, Ravi Ramamoorthi, Jonathan Barron, and Ren Ng. 2020. Fourier features let networks learn high frequency functions in low dimensional domains. Advances in neural information processing systems, Vol. 33 (2020), 7537-7547."},{"key":"e_1_3_2_1_38_1","unstructured":"Zecheng Tang Chenfei Wu Juntao Li and Nan Duan. 2023. LayoutNUWA: Revealing the Hidden Layout Expertise of Large Language Models. arXiv:2309.09506 [cs.CV] https:\/\/arxiv.org\/abs\/2309.09506"},{"key":"e_1_3_2_1_39_1","volume-title":"Rohit Girdhar, and Ishan Misra.","author":"Wang Xudong","year":"2024","unstructured":"Xudong Wang, Trevor Darrell, Sai Saketh Rambhatla, Rohit Girdhar, and Ishan Misra. 2024. InstanceDiffusion: Instance-level Control for Image Generation. arXiv:2402.03290 [cs.CV] https:\/\/arxiv.org\/abs\/2402.03290"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02154"},{"key":"e_1_3_2_1_42_1","unstructured":"Zhaohui Zheng Ping Wang Wei Liu Jinze Li Rongguang Ye and Dongwei Ren. 2019. Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression. arXiv:1911.08287 [cs.CV] https:\/\/arxiv.org\/abs\/1911.08287"},{"key":"e_1_3_2_1_43_1","unstructured":"Zhaohui Zheng Ping Wang Dongwei Ren Wei Liu Rongguang Ye Qinghua Hu and Wangmeng Zuo. 2021. Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation. arXiv:2005.03572 [cs.CV] https:\/\/arxiv.org\/abs\/2005.03572"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Bolei Zhou Hang Zhao Xavier Puig Tete Xiao Sanja Fidler Adela Barriuso and Antonio Torralba. 2018. Semantic Understanding of Scenes through the ADE20K Dataset. arXiv:1608.05442 [cs.CV] https:\/\/arxiv.org\/abs\/1608.05442","DOI":"10.1007\/s11263-018-1140-0"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755241","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:42Z","timestamp":1765309602000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755241"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":44,"alternative-id":["10.1145\/3746027.3755241","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755241","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}