{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:21:32Z","timestamp":1776889292047,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["62276256, U2441251"],"award-info":[{"award-number":["62276256, U2441251"]}]},{"name":"the Young Elite Scientists Sponsorship Program by CAST","award":["2023QNRC001"],"award-info":[{"award-number":["2023QNRC001"]}]},{"name":"the Young Scientists Fund of the State Key Laboratory of Multimodal Artificial Intelligence Systems","award":["ES2P100117"],"award-info":[{"award-number":["ES2P100117"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755064","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"7709-7718","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Uni-Layout: Integrating Human Feedback in Unified Layout Generation and Evaluation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-7547-3169","authenticated-orcid":false,"given":"Shuo","family":"Lu","sequence":"first","affiliation":[{"name":"NLPR &amp; MAIS, CASIA, Beijing, China and School of AI, UCAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5508-3605","authenticated-orcid":false,"given":"Yanyin","family":"Chen","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8890-4956","authenticated-orcid":false,"given":"Wei","family":"Feng","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4665-7152","authenticated-orcid":false,"given":"Jiahao","family":"Fan","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2145-9256","authenticated-orcid":false,"given":"Fengheng","family":"Li","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6391-4814","authenticated-orcid":false,"given":"Zheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5518-7077","authenticated-orcid":false,"given":"Jingjing","family":"Lv","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6983-5213","authenticated-orcid":false,"given":"Junjie","family":"Shen","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3275-2528","authenticated-orcid":false,"given":"Ching","family":"Law","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3890-1894","authenticated-orcid":false,"given":"Jian","family":"Liang","sequence":"additional","affiliation":[{"name":"NLPR &amp; MAIS, CASIA, Beijing, China and School of AI, UCAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2024. Claude 3.5 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet Accessed: 2025-04-05."},{"key":"e_1_3_2_1_3_1","first-page":"13642","article-title":"Variational transformer networks for layout generation","author":"Arroyo Diego Martin","year":"2021","unstructured":"Diego Martin Arroyo, Janis Postels, and Federico Tombari. 2021. Variational transformer networks for layout generation. In Proc. CVPR. 13642-13652.","journal-title":"Proc. CVPR."},{"key":"e_1_3_2_1_4_1","volume-title":"Training diffusion models with reinforcement learning. arXiv preprint arXiv:2305.13301","author":"Black Kevin","year":"2023","unstructured":"Kevin Black, Michael Janner, Yilun Du, Ilya Kostrikov, and Sergey Levine. 2023. Training diffusion models with reinforcement learning. arXiv preprint arXiv:2305.13301 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Relactrl: Relevance-guided efficient control for diffusion transformers. arXiv preprint arXiv:2502.14377","author":"Cao Ke","year":"2025","unstructured":"Ke Cao, Jing Wang, Ao Ma, Jiasong Feng, Zhanjie Zhang, Xuanhua He, Shanyuan Liu, Bo Cheng, Dawei Leng, Yuhui Yin, et al., 2025. Relactrl: Relevance-guided efficient control for diffusion transformers. arXiv preprint arXiv:2502.14377 (2025)."},{"key":"e_1_3_2_1_6_1","volume-title":"PAID: A Framework of Product-Centric Advertising Image Design. arXiv preprint arXiv:2501.14316","author":"Chen Hongyu","year":"2025","unstructured":"Hongyu Chen, Min Zhou, Jing Jiang, Jiale Chen, Yang Lu, Bo Xiao, Tiezheng Ge, and Bo Zheng. 2025b. PAID: A Framework of Product-Centric Advertising Image Design. arXiv preprint arXiv:2501.14316 (2025)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proc. ICLR.","author":"Chen Jian","year":"2024","unstructured":"Jian Chen, Ruiyi Zhang, Yufan Zhou, Rajiv Jain, Zhiqiang Xu, Ryan Rossi, and Changyou Chen. 2024b. Towards aligned layout generation via diffusion model with aesthetic constraints. In Proc. ICLR."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.2300312"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714836"},{"key":"e_1_3_2_1_10_1","volume-title":"Deep reinforcement learning from human preferences. Advances in neural information processing systems","author":"Christiano Paul F","year":"2017","unstructured":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. 2017. Deep reinforcement learning from human preferences. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_11_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"European Conference on Computer Vision. Springer, 399-415","author":"Du Zhenbang","year":"2024","unstructured":"Zhenbang Du, Wei Feng, Haohan Wang, Yaoyu Li, Jingsen Wang, Jian Li, Zheng Zhang, Jingjing Lv, Xin Zhu, Junsheng Jin, et al., 2024. Towards reliable advertising image generation using human feedback. In European Conference on Computer Vision. Springer, 399-415."},{"key":"e_1_3_2_1_13_1","volume-title":"Fancyvideo: Towards dynamic and consistent video generation via cross-frame textual guidance. arXiv preprint arXiv:2408.08189","author":"Feng Jiasong","year":"2024","unstructured":"Jiasong Feng, Ao Ma, Jing Wang, Bo Cheng, Xiaodan Liang, Dawei Leng, and Yuhui Yin. 2024. Fancyvideo: Towards dynamic and consistent video generation via cross-frame textual guidance. arXiv preprint arXiv:2408.08189 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al., 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proc","author":"Andrade Guerreiro Julian Jorge","unstructured":"Julian Jorge Andrade Guerreiro, Naoto Inoue, Kento Masui, Mayu Otani, and Hideki Nakayama. 2024. LayoutFlow: flow matching for layout generation. In Proc. ECCV. Springer, 56-72."},{"key":"e_1_3_2_1_16_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025b. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_17_1","volume-title":"ContentDM: A Layout Diffusion Model for Content-Aware Layout Generation","author":"Guo Honglin","year":"2025","unstructured":"Honglin Guo, Weizhi Nie, Ruidong Chen, Lanjun Wang, Guoqing Jin, and Anan Liu. 2025a. ContentDM: A Layout Diffusion Model for Content-Aware Layout Generation. IEEE Transactions on Artificial Intelligence (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"Proc","author":"He Liu","unstructured":"Liu He, Yijuan Lu, John Corring, Dinei Florencio, and Cha Zhang. 2023. Diffusion-based document layout generation. In Proc. ICDAR. Springer, 361-378."},{"key":"e_1_3_2_1_19_1","volume-title":"PlanGen: Towards Unified Layout Planning and Image Generation in Auto-Regressive Vision Language Models. arXiv preprint arXiv:2503.10127","author":"He Runze","year":"2025","unstructured":"Runze He, Bo Cheng, Yuhang Ma, Qingxiang Jia, Shanyuan Liu, Ao Ma, Xiaoyu Wu, Liebucha Wu, Dawei Leng, and Yuhui Yin. 2025. PlanGen: Towards Unified Layout Planning and Image Generation in Auto-Regressive Vision Language Models. arXiv preprint arXiv:2503.10127 (2025)."},{"key":"e_1_3_2_1_20_1","volume-title":"The real-world-weight cross-entropy loss function: Modeling the costs of mislabeling","author":"Ho Yaoshiang","year":"2019","unstructured":"Yaoshiang Ho and Samuel Wookey. 2019. The real-world-weight cross-entropy loss function: Modeling the costs of mislabeling. IEEE access, Vol. 8 (2019), 4806-4813."},{"key":"e_1_3_2_1_21_1","first-page":"67","article-title":"Retrieval-augmented layout transformer for content-aware layout generation","author":"Horita Daichi","year":"2024","unstructured":"Daichi Horita, Naoto Inoue, Kotaro Kikuchi, Kota Yamaguchi, and Kiyoharu Aizawa. 2024. Retrieval-augmented layout transformer for content-aware layout generation. In Proc. CVPR. 67-76.","journal-title":"Proc. CVPR."},{"key":"e_1_3_2_1_22_1","first-page":"6018","article-title":"Posterlayout: A new benchmark and approach for content-aware visual-textual presentation layout","author":"Hsu Hsiao Yuan","year":"2023","unstructured":"Hsiao Yuan Hsu, Xiangteng He, Yuxin Peng, Hao Kong, and Qing Zhang. 2023. Posterlayout: A new benchmark and approach for content-aware visual-textual presentation layout. In Proc. CVPR. 6018-6026.","journal-title":"Proc. CVPR."},{"key":"e_1_3_2_1_23_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models.. In Proc. ICLR, Vol. 1. 3.","journal-title":"Proc. ICLR"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00980"},{"key":"e_1_3_2_1_25_1","first-page":"42689","article-title":"Res-tuning: A flexible and efficient tuning paradigm via unbinding tuner from backbone","volume":"36","author":"Jiang Zeyinzi","year":"2023","unstructured":"Zeyinzi Jiang, Chaojie Mao, Ziyuan Huang, Ao Ma, Yiliang Lv, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. Res-tuning: A flexible and efficient tuning paradigm via unbinding tuner from backbone. Advances in Neural Information Processing Systems, Vol. 36 (2023), 42689-42716.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1372-x"},{"key":"e_1_3_2_1_27_1","volume-title":"Multimodal Markup Document Models for Graphic Design Completion. arXiv preprint arXiv:2409.19051","author":"Kikuchi Kotaro","year":"2024","unstructured":"Kotaro Kikuchi, Naoto Inoue, Mayu Otani, Edgar Simo-Serra, and Kota Yamaguchi. 2024. Multimodal Markup Document Models for Graphic Design Completion. arXiv preprint arXiv:2409.19051 (2024)."},{"key":"e_1_3_2_1_28_1","first-page":"88","article-title":"Constrained graphic layout generation via latent optimization","author":"Kikuchi Kotaro","year":"2021","unstructured":"Kotaro Kikuchi, Edgar Simo-Serra, Mayu Otani, and Kota Yamaguchi. 2021. Constrained graphic layout generation via latent optimization. In ACMMM. 88-96.","journal-title":"ACMMM."},{"key":"e_1_3_2_1_29_1","first-page":"36652","article-title":"Pick-a-pic: An open dataset of user preferences for text-to-image generation","volume":"36","author":"Kirstain Yuval","year":"2023","unstructured":"Yuval Kirstain, Adam Polyak, Uriel Singer, Shahbuland Matiana, Joe Penna, and Omer Levy. 2023. Pick-a-pic: An open dataset of user preferences for text-to-image generation. Advances in Neural Information Processing Systems, Vol. 36 (2023), 36652-36663.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","volume-title":"Proc","author":"Kong Xiang","unstructured":"Xiang Kong, Lu Jiang, Huiwen Chang, Han Zhang, Yuan Hao, Haifeng Gong, and Irfan Essa. 2022. BLT: bidirectional layout transformer for controllable layout generation. In Proc. ECCV. Springer, 474-490."},{"key":"e_1_3_2_1_31_1","unstructured":"S Kulkarni. [n.d.]. 11. future of technology chatgpt: Optimizing language models for dialogue.''. ICP Monogram on Digital Technology in Clinical Medicine ([n.d.]) 62."},{"key":"e_1_3_2_1_32_1","first-page":"1249","article-title":"Relation-aware diffusion model for controllable poster layout generation","author":"Li Fengheng","year":"2023","unstructured":"Fengheng Li, An Liu, Wei Feng, Honghe Zhu, Yaoyu Li, Zheng Zhang, Jingjing Lv, Xin Zhu, Junjie Shen, Zhangang Lin, et al., 2023b. Relation-aware diffusion model for controllable poster layout generation. In Proc. CIKM. 1249-1258.","journal-title":"Proc. CIKM."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2020.2999335"},{"key":"e_1_3_2_1_34_1","volume-title":"Silkie: Preference distillation for large visual language models. arXiv preprint arXiv:2312.10665","author":"Li Lei","year":"2023","unstructured":"Lei Li, Zhihui Xie, Mukai Li, Shunian Chen, Peiyi Wang, Liang Chen, Yazheng Yang, Benyou Wang, and Lingpeng Kong. 2023c. Silkie: Preference distillation for large visual language models. arXiv preprint arXiv:2312.10665 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679557"},{"key":"e_1_3_2_1_36_1","unstructured":"Zhaochen Li Fengheng Li Wei Feng Honghe Zhu Yaoyu Li Zheng Zhang Jingjing Lv Junjie Shen Zhangang Lin Jingping Shao et al. 2023a. Planning and Rendering: Towards Product Poster Generation with Diffusion Models. arXiv preprint arXiv:2312.08822 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Proc. NeurIPS.","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. In Proc. NeurIPS."},{"key":"e_1_3_2_1_38_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_39_1","first-page":"61","volume-title":"Proceedings of SmartGraphics","volume":"2001","author":"Lok Simon","year":"2001","unstructured":"Simon Lok and Steven Feiner. 2001. A survey of automated layout techniques for information presentations. Proceedings of SmartGraphics, Vol. 2001 (2001), 61-68."},{"key":"e_1_3_2_1_40_1","volume-title":"Realhf: Optimized rlhf training for large language models through parameter reallocation. arXiv preprint arXiv:2406.14088","author":"Mei Zhiyu","year":"2024","unstructured":"Zhiyu Mei, Wei Fu, Kaiwei Li, Guangju Wang, Huanchen Zhang, and Yi Wu. 2024. Realhf: Optimized rlhf training for large language models through parameter reallocation. arXiv preprint arXiv:2406.14088 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Proc. NeurIPS.","author":"Meng Yu","year":"2024","unstructured":"Yu Meng, Mengzhou Xia, and Danqi Chen. 2024. Simpo: Simple preference optimization with a reference-free reward. In Proc. NeurIPS."},{"key":"e_1_3_2_1_42_1","volume-title":"Mark Rowland, Zhaohan Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mesnard, Andrea Michi, et al.","author":"Munos R\u00e9mi","year":"2023","unstructured":"R\u00e9mi Munos, Michal Valko, Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Zhaohan Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mesnard, Andrea Michi, et al., 2023. Nash learning from human feedback. arXiv preprint arXiv:2312.00886, Vol. 18 (2023)."},{"key":"e_1_3_2_1_43_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems Vol. 35 (2022) 27730-27744."},{"key":"e_1_3_2_1_44_1","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct preference optimization: Your language model is secretly a reward model. In Proc. NeurIPS, Vol. 36. 53728-53741.","journal-title":"Proc. NeurIPS"},{"key":"e_1_3_2_1_45_1","first-page":"91","article-title":"An automated layout approach for model-driven WIMP-UI generation","author":"Raneburger David","year":"2012","unstructured":"David Raneburger, Roman Popp, and Jean Vanderdonckt. 2012. An automated layout approach for model-driven WIMP-UI generation. In Proc. CHI. 91-100.","journal-title":"Proc. CHI."},{"key":"e_1_3_2_1_46_1","volume-title":"Posterllama: Bridging design ability of langauge model to contents-aware layout generation. arXiv preprint arXiv:2404.00995","author":"Seol Jaejung","year":"2024","unstructured":"Jaejung Seol, Seojun Kim, and Jaejun Yoo. 2024. Posterllama: Bridging design ability of langauge model to contents-aware layout generation. arXiv preprint arXiv:2404.00995 (2024)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29865"},{"key":"e_1_3_2_1_48_1","unstructured":"Zhiqing Sun Sheng Shen Shengcao Cao Haotian Liu Chunyuan Li Yikang Shen Chuang Gan Liang-Yan Gui Yu-Xiong Wang Yiming Yang et al. 2023. Aligning large multimodal models with factually augmented rlhf. arXiv preprint arXiv:2309.14525 (2023)."},{"key":"e_1_3_2_1_49_1","first-page":"1","article-title":"Automatic layout generation for graphical design magazines","author":"Tabata Sou","year":"2019","unstructured":"Sou Tabata, Hiroki Yoshihara, Haruka Maeda, and Kei Yokoyama. 2019. Automatic layout generation for graphical design magazines. In Proc. SIGGRAPH. 1-2.","journal-title":"Proc. SIGGRAPH."},{"key":"e_1_3_2_1_50_1","volume-title":"Layoutnuwa: Revealing the hidden layout expertise of large language models. arXiv preprint arXiv:2309.09506","author":"Tang Zecheng","year":"2023","unstructured":"Zecheng Tang, Chenfei Wu, Juntao Li, and Nan Duan. 2023. Layoutnuwa: Revealing the hidden layout expertise of large language models. arXiv preprint arXiv:2309.09506 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00786"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888013"},{"key":"e_1_3_2_1_53_1","volume-title":"Wisa: World simulator assistant for physics-aware text-to-video generation. arXiv preprint arXiv:2503.08153","author":"Wang Jing","year":"2025","unstructured":"Jing Wang, Ao Ma, Ke Cao, Jun Zheng, Zhanjie Zhang, Jiasong Feng, Shanyuan Liu, Yuhang Ma, Bo Cheng, Dawei Leng, et al., 2025b. Wisa: World simulator assistant for physics-aware text-to-video generation. arXiv preprint arXiv:2503.08153 (2025)."},{"key":"e_1_3_2_1_54_1","volume-title":"Qihoo-t2x: An efficiency-focused diffusion transformer via proxy tokens for text-to-any-task. arXiv e-prints","author":"Wang Jing","year":"2024","unstructured":"Jing Wang, Ao Ma, Jiasong Feng, Dawei Leng, Yuhui Yin, and Xiaodan Liang. 2024c. Qihoo-t2x: An efficiency-focused diffusion transformer via proxy tokens for text-to-any-task. arXiv e-prints (2024), arXiv-2409."},{"key":"e_1_3_2_1_55_1","first-page":"10716","article-title":"Prompt2Poster: Automatically Artistic Chinese Poster Creation from Prompt Only","author":"Wang Shaodong","year":"2024","unstructured":"Shaodong Wang, Yunyang Ge, Liuhan Chen, Haiyang Zhou, Qian Wang, Xinhua Cheng, and Li Yuan. 2024b. Prompt2Poster: Automatically Artistic Chinese Poster Creation from Prompt Only. In Proc. ACM MM. 10716-10724.","journal-title":"Proc. ACM MM."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Xiyao Wang Jiuhai Chen Zhaoyang Wang Yuhang Zhou Yiyang Zhou Huaxiu Yao Tianyi Zhou Tom Goldstein Parminder Bhatia Furong Huang et al. 2024a. Enhancing visual-language modality alignment in large vision language models via self-improvement. arXiv preprint arXiv:2405.15973 (2024).","DOI":"10.18653\/v1\/2025.findings-naacl.15"},{"key":"e_1_3_2_1_57_1","volume-title":"Proc. NeurIPS.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. In Proc. NeurIPS."},{"key":"e_1_3_2_1_58_1","first-page":"12721","article-title":"Desigen: A pipeline for controllable design template generation","author":"Weng Haohan","year":"2024","unstructured":"Haohan Weng, Danqing Huang, Yu Qiao, Zheng Hu, Chin-Yew Lin, Tong Zhang, and CL Chen. 2024. Desigen: A pipeline for controllable design template generation. In Proc. CVPR. 12721-12732.","journal-title":"Proc. CVPR."},{"key":"e_1_3_2_1_59_1","volume-title":"Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341","author":"Wu Xiaoshi","year":"2023","unstructured":"Xiaoshi Wu, Yiming Hao, Keqiang Sun, Yixiong Chen, Feng Zhu, Rui Zhao, and Hongsheng Li. 2023. Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Image-aware layout generation with user constraints for poster design. The Visual Computer","author":"Xu Chenchen","year":"2024","unstructured":"Chenchen Xu, Kaixin Han, and Weiwei Xu. 2024a. Image-aware layout generation with user constraints for poster design. The Visual Computer (2024), 1-14."},{"key":"e_1_3_2_1_61_1","volume-title":"Visionreward: Fine-grained multi-dimensional human preference learning for image and video generation. arXiv preprint arXiv:2412.21059","author":"Xu Jiazheng","year":"2024","unstructured":"Jiazheng Xu, Yu Huang, Jiale Cheng, Yuanming Yang, Jiajun Xu, Yuan Wang, Wenbo Duan, Shen Yang, Qunlin Jin, Shurun Li, et al., 2024b. Visionreward: Fine-grained multi-dimensional human preference learning for image and video generation. arXiv preprint arXiv:2412.21059 (2024)."},{"key":"e_1_3_2_1_62_1","first-page":"15903","article-title":"Imagereward: Learning and evaluating human preferences for text-to-image generation","volume":"36","author":"Xu Jiazheng","year":"2023","unstructured":"Jiazheng Xu, Xiao Liu, Yuchen Wu, Yuxuan Tong, Qinkai Li, Ming Ding, Jie Tang, and Yuxiao Dong. 2023. Imagereward: Learning and evaluating human preferences for text-to-image generation. Advances in Neural Information Processing Systems, Vol. 36 (2023), 15903-15935.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01310"},{"key":"e_1_3_2_1_64_1","volume-title":"Proc. ICML.","author":"Zeng Yongcheng","year":"2024","unstructured":"Yongcheng Zeng, Guoqing Liu, Weiyu Ma, Ning Yang, Haifeng Zhang, and Jun Wang. 2024. Token-level direct preference optimization. In Proc. ICML."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.123136"},{"key":"e_1_3_2_1_66_1","volume-title":"Mavis: Mathematical visual instruction tuning with an automatic data engine. arXiv preprint arXiv:2407.08739","author":"Zhang Renrui","year":"2024","unstructured":"Renrui Zhang, Xinyu Wei, Dongzhi Jiang, Ziyu Guo, Shicheng Li, Yichi Zhang, Chengzhuo Tong, Jiaming Liu, Aojun Zhou, Bin Wei, et al., 2024b. Mavis: Mathematical visual instruction tuning with an automatic data engine. arXiv preprint arXiv:2407.08739 (2024)."},{"key":"e_1_3_2_1_67_1","volume-title":"Mm-rlhf: The next step forward in multimodal llm alignment. arXiv preprint arXiv:2502.10391","author":"Zhang Yi-Fan","year":"2025","unstructured":"Yi-Fan Zhang, Tao Yu, Haochen Tian, Chaoyou Fu, Peiyan Li, Jianshu Zeng, Wulin Xie, Yang Shi, Huanyu Zhang, Junkang Wu, et al., 2025. Mm-rlhf: The next step forward in multimodal llm alignment. arXiv preprint arXiv:2502.10391 (2025)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28570"},{"key":"e_1_3_2_1_69_1","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al., 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46595-46623.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_70_1","volume-title":"Scientific poster generation: A new dataset and approach. Pattern Recognition","author":"Zhong Xinyi","year":"2025","unstructured":"Xinyi Zhong, Zusheng Tan, Jing Li, Shen Gao, Jing Ma, Shanshan Feng, and Billy Chiu. 2025. Scientific poster generation: A new dataset and approach. Pattern Recognition (2025), 111507."},{"key":"e_1_3_2_1_71_1","first-page":"1015","article-title":"Publaynet: largest dataset ever for document layout analysis","author":"Zhong Xu","year":"2019","unstructured":"Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. 2019. Publaynet: largest dataset ever for document layout analysis. In Proc. ICDAR. IEEE, 1015-1022.","journal-title":"Proc. ICDAR. IEEE"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/692"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:22:27Z","timestamp":1765308147000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755064"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":72,"alternative-id":["10.1145\/3746027.3755064","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755064","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}