{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T07:11:32Z","timestamp":1778051492849,"version":"3.51.4"},"reference-count":73,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006785","name":"Google","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006785","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00358","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"3668-3678","source":"Crossref","is-referenced-by-count":0,"title":["Autoregressive Styled Text Image Generation, but Make it Reliable"],"prefix":"10.1109","author":[{"given":"Carmine","family":"Zaccagnino","sequence":"first","affiliation":[{"name":"University of Modena and Reggio Emilia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fabio","family":"Quattrini","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vittorio","family":"Pippi","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Silvia","family":"Cascianelli","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alessio","family":"Tonioni","sequence":"additional","affiliation":[{"name":"Google"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"STCN: Stochastic Temporal Convolutional Networks","author":"Aksan","year":"2018","journal-title":"ICLR"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173779"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00083"},{"key":"ref4","article-title":"RIMES evaluation campaign for handwritten mail processing","author":"Augustin","year":"2006","journal-title":"IWFHR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00112"},{"key":"ref6","article-title":"Semi-Supervised Adaptation of Diffusion Models for Handwritten Text Generation","author":"Brandenbusch","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-89131-2_31"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s10032-022-00401-y"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"ref10","author":"Chen","year":"2025","journal-title":"Context-aware autoregressive models for multi-conditional image generation"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00579"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73636-0_24"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.5244\/C.34.174"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref15","article-title":"Fluid: Scaling Autoregressive Text-to-image Generative Models with Continuous Tokens","author":"Fan","year":"2025","journal-title":"ICLR"},{"key":"ref16","article-title":"Unified autoregressive visual generation and understanding with continuous tokens","author":"Fan","year":"2025"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00438"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.16917"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3550070"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.31390\/gradschool_dissertations.4601"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/2886099"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01467"},{"key":"ref23","article-title":"GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium","author":"Heusel","year":"2017","journal-title":"NeurIPS"},{"key":"ref24","article-title":"beta-vae: Learning basic visual concepts with a constrained variational framework","author":"Higgins","year":"2017","journal-title":"ICLR"},{"key":"ref25","article-title":"Classifier-Free Diffusion Guidance","volume-title":"NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications","author":"Ho"},{"key":"ref26","article-title":"Straightening out the straight-through estimator: Overcoming optimization challenges in vector quantized networks","author":"Huh","year":"2023","journal-title":"ICML"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58592-1_17"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3122572"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2013.117"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3239736"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26538"},{"key":"ref32","article-title":"Autoregressive Image Generation without Vector Quantization","author":"Li","year":"2024","journal-title":"NeurIPS"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73226-3_21"},{"key":"ref34","article-title":"Glyph-byt5-v2: A strong aesthetic base-line for accurate multilingual visual text rendering","author":"Liu","year":"2024"},{"key":"ref35","article-title":"Diffusion Models for Handwriting Generation","author":"Luhman","year":"2020"},{"key":"ref36","article-title":"SLOGAN: Handwriting Style Synthesis for Arbitrary-Length and Out-of-Vocabulary Text","author":"Luo","year":"2022","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/s100320200071"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02525-0"},{"key":"ref39","article-title":"Finite Scalar Quantization: VQ-VAE Made Simple","author":"Mentzer","year":"2024","journal-title":"ICLR"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00740"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41679-8_22"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_24"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00725"},{"key":"ref44","article-title":"Unconditional Priors Matter! Improving Conditional Generation of Fine-Tuned Diffusion Models","author":"Phunyaphibarn","year":"2025"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2023.06.003"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02151"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41679-8_19"},{"key":"ref48","article-title":"HWD: A Novel Evaluation Score for Styled Handwritten Text Generation","author":"Pippi","year":"2023","journal-title":"BMVC"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00775"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00741"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-92808-6_3"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-70543-4_1"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72986-7_14"},{"key":"ref54","first-page":"212","article-title":"\u03bc gat: Improving Single-Page Document Parsing by Providing Multi-page Context","volume-title":"Proceedings of the European Conference on Computer Vision Workshops","author":"Quattrini"},{"key":"ref55","first-page":"9","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref56","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"ref57","article-title":"Zero-shot text-to-image generation","author":"Ramesh","year":"2021","journal-title":"ICML"},{"key":"ref58","article-title":"Generating diverse high-fidelity images with vq-vae-2","author":"Razavi","year":"2019","journal-title":"NeurIPS"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-8141-0_7"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref61","article-title":"Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation","author":"Sun","year":"2024"},{"key":"ref62","article-title":"NextStep-1: Toward Autoregressive Image Generation with Continuous Tokens at Scale","author":"Han","year":"2025"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_17"},{"key":"ref64","author":"Tuo","year":"2023","journal-title":"Anytext: Multilingual visual text generation and editing"},{"key":"ref65","author":"Tuo","year":"2024","journal-title":"Anytext2: Visual text generation and editing with customizable attributes"},{"key":"ref66","article-title":"Neural discrete representation learning","volume":"30","author":"Van Den Oord","year":"2017","journal-title":"NeurIPS"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3481154"},{"key":"ref68","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00461"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01472"},{"key":"ref71","article-title":"Scaling autoregressive models for content-rich text-to-image generation","author":"Yu","year":"2022","journal-title":"TMLR"},{"key":"ref72","author":"Zhou","year":"2025","journal-title":"Transfusion: Predict the next token and diffuse images with one multi-modal model"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01368"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492093.pdf?arnumber=11492093","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:13:49Z","timestamp":1778048029000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492093\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":73,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00358","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}