{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:51Z","timestamp":1777865331418,"version":"3.51.4"},"reference-count":113,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271308,62401365,62225112,62132006,U24A20220"],"award-info":[{"award-number":["62271308,62401365,62225112,62132006,U24A20220"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["BX20250411,2025M773473"],"award-info":[{"award-number":["BX20250411,2025M773473"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01022","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"10982-10994","source":"Crossref","is-referenced-by-count":0,"title":["F-Bench: Rethinking Human Preference Evaluation Metrics for Benchmarking Face Generation, Customization, and Restoration"],"prefix":"10.1109","author":[{"given":"Lu","family":"Liu","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huiyu","family":"Duan","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Hu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liu","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunlei","family":"Cai","sequence":"additional","affiliation":[{"name":"Bilibili Inc.,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianxiao","family":"Ye","sequence":"additional","affiliation":[{"name":"Bilibili Inc.,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huayu","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangtao","family":"Zhai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"issue":"3","key":"ref1","volume-title":"Deep Floyd","volume":"2","year":"2024"},{"key":"ref2","volume-title":"Dreamlike V2","volume":"5","year":"2024"},{"key":"ref3","volume-title":"Flux-dev","year":"2024"},{"key":"ref4","volume-title":"IP-Adapter-FaceID-SDXL","year":"2024"},{"key":"ref5","volume-title":"IP-Adapter-FaceID-Plus","year":"2024"},{"key":"ref6","volume-title":"Kolors","year":"2024"},{"key":"ref7","volume-title":"ProtoVision V6.6","year":"2024"},{"key":"ref8","volume-title":"Realistic Vision V5.1","year":"2024"},{"key":"ref9","volume-title":"SD3","year":"2024"},{"key":"ref10","volume-title":"GPT-4o","year":"2024"},{"key":"ref11","article-title":"Qwen2. 5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00565"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00961"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73232-4_14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3043093"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.01172"},{"key":"ref17","article-title":"Pixart- \\alpha: Fast training of diffusion transformer for photorealistic text-to-image synthesis","volume-title":"arXiv preprint","author":"Chen","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00283"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00264"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00100"},{"key":"ref21","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","volume":"7","author":"Dao"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00305"},{"key":"ref24","article-title":"An image is worth one word: Personalizing text-toimage generation using textual inversion","author":"Gal","year":"2022","journal-title":"arXiv preprint"},{"key":"ref25","first-page":"1220","article-title":"No-reference image Quality assessment via transformers, relative ranking, and self-consistency","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","author":"Alireza","year":"2022"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/SiPS.2013.6674512"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19797-0_8"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref30","author":"He","year":"2024","journal-title":"Idanimator: Zero-shot identity-preserving human video generation"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICB45273.2019.8987255"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"ref33","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)","volume":"33","author":"Ho","year":"2020"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"ref35","article-title":"Lora: Lowrank adaptation of large language models","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Edward","year":"2021"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2025.3559140"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00090"},{"key":"ref38","article-title":"Labeled faces in the wild: A database for studying face recognition in unconstrained environments","author":"Huang","year":"2007","journal-title":"Technical Report 07\u201349, University of Massachusetts, Amherst"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3443"},{"key":"ref40","article-title":"Catekv: On sequential consistency for long-context 11 m inference acceleration","volume-title":"Proceedings of International Conference on Machine Learning (ICML)","author":"Jiang","year":"2025"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00344"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.224"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.527"},{"key":"ref46","article-title":"Progressive face super-resolution via attention to facial landmark","author":"Kim","journal-title":"arXiv preprint"},{"key":"ref47","author":"Diederik","year":"2013","journal-title":"Auto-encoding variational bayes"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1594"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3319020"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3319020"},{"key":"ref52","author":"Li","year":"2024","journal-title":"Playground v2.5: Three insights towards enhancing aesthetic Quality in text-to-image generation"},{"key":"ref53","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Preceedings of International conference on machine learning (ICML)","author":"Li","year":"2022"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_17"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00278"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3215251"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00825"},{"key":"ref58","article-title":"Hunyuandit: A powerful multi-resolution diffusion transformer with fine-grained chinese understanding","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01835"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCT46805.2019.8947255"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1405.0312"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73202-7_25"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2788206"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TBC.2018.2816783"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4133-3"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2012.2227726"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.250"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00758"},{"key":"ref70","article-title":"W\u00fcrstchen: An efficient architecture for large-scale text-to-image diffusion models","author":"Pernias","year":"2023","journal-title":"arXiv preprint"},{"key":"ref71","article-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","year":"2023","journal-title":"arXiv preprint"},{"key":"ref72","author":"Ramesh","year":"2021","journal-title":"Zero-shot text-to-image generation"},{"key":"ref73","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2643"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2643"},{"issue":"13","key":"ref78","article-title":"Methodology for the subjective assessment of the Quality of television pictures","volume":"500","author":"Series","year":"2012","journal-title":"Recommendation ITU-R BT"},{"key":"ref79","article-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition","author":"Simonyan","year":"2014","journal-title":"arXiv e-prints, art."},{"key":"ref80","author":"Song","year":"2020","journal-title":"Denoising diffusion implicit models"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00372"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3301276"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00569"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02866"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_11"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-9119-8_5"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02168-7"},{"key":"ref88","article-title":"Understanding and evaluating human preferences for ai generated images with instruction tuning","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02168-7"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"ref91","article-title":"Instantid: Zero-shot identity-preserving generation in seconds","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref92","article-title":"A survey of deep face restoration: Denoise, super-resolution, deblur, artifact removal","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00217"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00905"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00905"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00170"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01204"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02227-z"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2585880"},{"key":"ref103","article-title":"Imagereward: Learning and evaluating human preferences for text-to-image generation","volume-title":"Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)","volume":"36","author":"Xu","year":"2024"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP51287.2024.10647885"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.596"},{"key":"ref106","article-title":"MANIQA: Multi-dimension Attention Network for NoReference Image Quality Assessment","author":"Yang","year":"2022","journal-title":"arXiv e-prints, art"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00073"},{"key":"ref108","article-title":"Ipadapter: Text compatible image prompt adapter for text-toimage diffusion models","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3432651"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-019-2757-1"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2426416"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2218"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445722.pdf?arnumber=11445722","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:15:44Z","timestamp":1777529744000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445722\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":113,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01022","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}