{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T15:24:41Z","timestamp":1778167481675,"version":"3.51.4"},"reference-count":86,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T00:00:00Z","timestamp":1746057600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402406"],"award-info":[{"award-number":["62402406"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"HKU IDS research Seed Fund"},{"name":"HKU Fintech Academy R&amp;D Funding"},{"name":"HKU Seed Fund for Basic Research"},{"name":"HKU Seed Fund for Translational and Applied Research"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1109\/tpami.2025.3531907","type":"journal-article","created":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T18:49:42Z","timestamp":1737398982000},"page":"3563-3579","source":"Crossref","is-referenced-by-count":23,"title":["T2I-CompBench++: An Enhanced and Comprehensive Benchmark for Compositional Text-to-Image Generation"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4884-4387","authenticated-orcid":false,"given":"Kaiyi","family":"Huang","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Pok Fu Lam, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5737-019X","authenticated-orcid":false,"given":"Chengqi","family":"Duan","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Pok Fu Lam, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8219-2084","authenticated-orcid":false,"given":"Kaiyue","family":"Sun","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Pok Fu Lam, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6890-1049","authenticated-orcid":false,"given":"Enze","family":"Xie","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Montreal, QC, Canada"}]},{"given":"Zhenguo","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab, Montreal, QC, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1831-9952","authenticated-orcid":false,"given":"Xihui","family":"Liu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Pok Fu Lam, Hong Kong"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref2","first-page":"47:1","article-title":"Cascaded diffusion models for high fidelity image generation","volume":"23","author":"Ho","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref3","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Saharia"},{"key":"ref4","first-page":"8780","article-title":"Diffusion models beat GANs on image synthesis","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dhariwal"},{"key":"ref5","first-page":"8162","article-title":"Improved denoising diffusion probabilistic models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Nichol"},{"key":"ref6","article-title":"Muse: Text-to-image generation via masked generative transformers","author":"Chang","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_26"},{"key":"ref8","article-title":"Training-free structured diffusion guidance for compositional text-to-image synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Feng"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00714"},{"key":"ref11","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"ref13","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/springerreference_9081"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"ref16","article-title":"The dawn of LMMs: Preliminary explorations with GPT-4V (ision)","author":"Yang","year":"2023"},{"key":"ref18","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Esser","year":"2024"},{"issue":"3","key":"ref19","first-page":"1","article-title":"Improving image generation with better captions","volume-title":"Comput. Sci.","volume":"2","author":"Betker","year":"2023"},{"key":"ref20","article-title":"Pixart-$alpha$alpha: Fast training of diffusion transformer for photorealistic text-to-image synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen"},{"key":"ref21","article-title":"SDXL: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","year":"2023"},{"key":"ref22","first-page":"78723","article-title":"T2I-CompBench: A comprehensive benchmark for open-world compositional text-to-image generation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Huang"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01834"},{"key":"ref24","first-page":"1060","article-title":"Generative adversarial text to image synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Reed"},{"key":"ref25","first-page":"217","article-title":"Learning what and where to draw","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Reed"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00595"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00089"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref31","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022"},{"key":"ref32","first-page":"16784","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nichol"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"ref34","article-title":"Matryoshka diffusion models","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Gu","year":"2023"},{"key":"ref35","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ramesh"},{"key":"ref36","article-title":"HIVE: Harnessing human feedback for instructional visual editing","author":"Zhang","year":"2023"},{"key":"ref37","article-title":"Aligning text-to-image models using human feedback","author":"Lee","year":"2023"},{"key":"ref38","article-title":"RAFT: Reward ranked finetuning for generative foundation model alignment","author":"Dong","year":"2023"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01766"},{"key":"ref40","article-title":"ECLIPSE: A resource-efficient text-to-image prior for image generations","author":"Patel","year":"2023"},{"key":"ref41","article-title":"Referee can play: An alternative approach to conditional generation via model inversion","author":"Liu","year":"2024"},{"key":"ref42","article-title":"Benchmark for compositional text-to-image synthesis","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Park"},{"key":"ref43","article-title":"LLM-grounded diffusion: Enhancing prompt understanding of text-to-image diffusion models with large language models","author":"Lian","year":"2023"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"ref45","article-title":"Compositional text-to-image synthesis with attention map control of diffusion models","author":"Wang","year":"2023"},{"key":"ref46","article-title":"Conform: Contrast is all you need for high-fidelity text-to-image diffusion models","author":"Meral","year":"2023"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"ref48","first-page":"3536","article-title":"Linguistic binding in diffusion models: Enhancing attribute correspondence through attention map alignment","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Rassin"},{"key":"ref49","article-title":"LLM blueprint: Enabling text-to-image generation with complex and detailed prompts","author":"Gani","year":"2023"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"ref51","article-title":"Box it to bind it: Unified layout control and attribute binding in T2I diffusion models","author":"Taghipour","year":"2024"},{"key":"ref52","article-title":"Divide and conquer: Language models can plan and self-correct for compositional text-to-image generation","author":"Wang","year":"2024"},{"key":"ref53","article-title":"Reason out your layout: Evoking the layout master from large language models for text-to-image synthesis","author":"Chen","year":"2023"},{"key":"ref54","article-title":"Mastering text-to-image diffusion: Recaptioning, planning, and generating with multimodal llms","author":"Yang","year":"2024"},{"key":"ref55","article-title":"RealCompo: Dynamic equilibrium between realism and compositionality improves text-to-image diffusion models","author":"Zhang","year":"2024"},{"key":"ref56","article-title":"The caltech-UCSD birds-200-2011 dataset","author":"Wah","year":"2011"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00283"},{"key":"ref60","article-title":"Human evaluation of text-to-image models on a multi-task benchmark","author":"Petsiuk","year":"2022"},{"key":"ref61","first-page":"2226","article-title":"Improved techniques for training GANs","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Salimans"},{"key":"ref62","first-page":"6626","article-title":"GANs trained by a two time-scale update rule converge to a local nash equilibrium","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Heusel"},{"key":"ref63","article-title":"LLMScore: Unveiling the power of large language models in text-to-image synthesis evaluation","author":"Lu","year":"2023"},{"key":"ref64","article-title":"X-IQE: Explainable image quality evaluation for text-to-image generation with visual large language models","author":"Chen","year":"2023"},{"key":"ref65","article-title":"Improving compositional text-to-image generation with large vision-language models","author":"Wen","year":"2023"},{"key":"ref66","first-page":"15903","article-title":"ImageReward: Learning and evaluating human preferences for text-to-image generation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Xu"},{"key":"ref67","article-title":"DreamSync: Aligning text-to-image generation with image understanding feedback","author":"Sun","year":"2023"},{"key":"ref68","first-page":"36652","article-title":"Pick-a-pic: An open dataset of user preferences for text-to-image generation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","volume":"36","author":"Kirstain"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"ref70","article-title":"Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis","author":"Wu","year":"2023"},{"key":"ref71","article-title":"Rich human feedback for text-to-image generation","author":"Liang","year":"2023"},{"key":"ref72","article-title":"ImagenHub: Standardizing the evaluation of conditional image generation models","author":"Ku","year":"2023"},{"key":"ref73","article-title":"VIEScore: Towards explainable metrics for conditional image synthesis evaluation","author":"Ku","year":"2023"},{"key":"ref74","first-page":"69981","article-title":"Holistic evaluation of text-to-image models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Lee"},{"key":"ref76","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023"},{"key":"ref77","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00742"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"ref80","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref81","article-title":"LoRa: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"ref82","first-page":"407","article-title":"On the value of out-of-distribution testing: An example of goodhart\u2019s law","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Teney"},{"key":"ref83","article-title":"Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection","author":"Liu","year":"2023"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.534"},{"key":"ref89","article-title":"Decoupled weight decay regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10958761\/10847875.pdf?arnumber=10847875","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T17:13:29Z","timestamp":1744305209000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10847875\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5]]},"references-count":86,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3531907","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5]]}}}