{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T05:10:42Z","timestamp":1782364242052,"version":"3.54.5"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726514","type":"print"},{"value":"9783031726521","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72652-1_23","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"386-402","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":43,"title":["TextDiffuser-2: Unleashing the\u00a0Power of\u00a0Language Models for\u00a0Text Rendering"],"prefix":"10.1007","author":[{"given":"Jingye","family":"Chen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yupan","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tengchao","family":"Lv","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lei","family":"Cui","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qifeng","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Furu","family":"Wei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"23_CR1","unstructured":"Balaji, Y., et\u00a0al.: ediffi: text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)"},{"key":"23_CR2","unstructured":"Chen, C., et al.: Position-enhanced visual instruction tuning for multimodal large language models. arXiv preprint arXiv:2308.13437 (2023)"},{"key":"23_CR3","unstructured":"Chen, J., Huang, Y., Lv, T., Cui, L., Chen, Q., Wei, F.: Textdiffuser: diffusion models as text painters. In: NeurIPS (2023)"},{"key":"23_CR4","unstructured":"Chen, J., et\u00a0al.: Pixart-$$\\alpha $$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)"},{"key":"23_CR5","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: a language modeling framework for object detection. In: ICLR (2021)"},{"key":"23_CR6","unstructured":"Chen, T., Saxena, S., Li, L., Lin, T.Y., Fleet, D.J., Hinton, G.E.: A unified sequence interface for vision tasks. In: NeurIPS (2022)"},{"key":"23_CR7","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing gpt-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"23_CR8","unstructured":"DALLE-3: Link: https:\/\/openai.com\/dall-e-3. (2023)"},{"key":"23_CR9","unstructured":"Daras, G., Dimakis, A.G.: Discovering the hidden vocabulary of dalle-2. arXiv preprint arXiv:2206.00169 (2022)"},{"key":"23_CR10","unstructured":"Das, A., Roy, P., Bhattacharya, S., Ghosh, S., Pal, U., Blumenstein, M.: Fast: font-agnostic scene text editing. arXiv preprint arXiv:2308.02905 (2023)"},{"key":"23_CR11","unstructured":"DeepFloyd: Github link: https:\/\/github.com\/deep-floyd\/if (2023)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Deka, B., et al.: Rico: a mobile app dataset for building data-driven design applications. In: UIST (2017)","DOI":"10.1145\/3126594.3126651"},{"key":"23_CR13","unstructured":"Feng, W., et al.: Layoutgpt: compositional visual planning and generation with large language models. In: NeurIPS (2023)"},{"key":"23_CR14","unstructured":"GPT-4: Link: https:\/\/openai.com\/gpt-4 (2023)"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Gupta, K., Lazarow, J., Achille, A., Davis, L.S., Mahadevan, V., Shrivastava, A.: Layouttransformer: layout generation and completion with self-attention. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00104"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"He, L., Lu, Y., Corring, J., Florencio, D., Zhang, C.: Diffusion-based document layout generation. In: ICDAR (2023)","DOI":"10.1007\/978-3-031-41676-7_21"},{"key":"23_CR18","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"key":"23_CR19","unstructured":"ideogram: Link: https:\/\/ideogram.ai\/ (2023)"},{"key":"23_CR20","unstructured":"Ji, J., et al.: Improving diffusion models for scene text editing with dual encoders. Transactions on Machine Learning Research (2024)"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Jyothi, A.A., Durand, T., He, J., Sigal, L., Mori, G.: Layoutvae: stochastic scene layout generation from a label set. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00999"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Krishnan, P., Kovvuri, R., Pang, G., Vassilev, B., Hassner, T.: Textstylebrush: transfer of text aesthetics from a single example. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","DOI":"10.1109\/TPAMI.2023.3239736"},{"key":"23_CR23","unstructured":"Lee, J., et al.: Rewritenet: reliable scene text editing with implicit decomposition of text contents and styles. arXiv preprint arXiv:2107.11041 (2021)"},{"key":"23_CR24","unstructured":"Li, J., Yang, J., Hertzmann, A., Zhang, J., Xu, T.: Layoutgan: generating graphic layouts with wireframe discriminators. In: ICLR (2019)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Gligen: open-set grounded text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"23_CR26","unstructured":"Lin, J., Guo, J., Sun, S., Yang, Z., Lou, J.G., Zhang, D.: Layoutprompter: awaken the design ability of large language models. In: NeurIPS (2024)"},{"key":"23_CR27","unstructured":"Lin, J., Guo, J., Sun, S., Yang, Z.J., Lou, J.G., Zhang, D.: Layoutprompter: awaken the design ability of large language models. In: NeurIPS (2023)"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: Common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Liu, R., et al.: Character-aware models improve visual text rendering. In: ACL (2023)","DOI":"10.18653\/v1\/2023.acl-long.900"},{"key":"23_CR30","unstructured":"Lv, T., et\u00a0al.: Kosmos-2.5: a multimodal literate model. arXiv preprint arXiv:2309.11419 (2023)"},{"key":"23_CR31","unstructured":"Ma, J., et al.: Glyphdraw: learning to draw Chinese characters in image synthesis models coherently. arXiv preprint arXiv:2303.17870 (2023)"},{"key":"23_CR32","unstructured":"Midjourney-v6: (2023). https:\/\/www.midjourney-v6.com\/"},{"key":"23_CR33","doi-asserted-by":"crossref","unstructured":"Patil, A.G., Ben-Eliezer, O., Perel, O., Averbuch-Elor, H.: Read: Recursive autoencoders for document layout generation. In: CVPRW (2020)","DOI":"10.1109\/CVPRW50498.2020.00280"},{"key":"23_CR34","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. In: ICLR (2024)"},{"key":"23_CR35","unstructured":"Pernias, P., Rampas, D., Richter, M.L., Pal, C., Aubreville, M.: W\u00fcrstchen: an efficient architecture for large-scale text-to-image diffusion models. In: ICLR (2024)"},{"key":"23_CR36","unstructured":"Podell, D., et al.: Sdxl: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Qu, Y., Tan, Q., Xie, H., Xu, J., Wang, Y., Zhang, Y.: Exploring stroke-level modifications for scene text editing. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i2.25305"},{"key":"23_CR38","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"23_CR39","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21, 1\u201367 (2020)"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"23_CR41","doi-asserted-by":"crossref","unstructured":"Roy, P., Bhattacharya, S., Ghosh, S., Pal, U.: Stefann: scene text editor using font adaptive neural network. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01324"},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Saharia, C., et al.: Palette: image-to-image diffusion models. In: SIGGRAPH (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"23_CR44","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. In: ACL (2016)","DOI":"10.18653\/v1\/P16-1162"},{"key":"23_CR45","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2021)"},{"key":"23_CR46","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"23_CR47","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"23_CR48","unstructured":"Tuo, Y., Xiang, W., He, J.Y., Geng, Y., Xie, X.: Anytext: multilingual visual text generation and editing. arXiv preprint arXiv:2311.03054 (2023)"},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Wu, L., et al.: Editing text in the wild. In: ACM MM (2019)","DOI":"10.1145\/3343031.3350929"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: Byt5: towards a token-free future with pre-trained byte-to-byte models. Transactions of the Association for Computational Linguistics (2022)","DOI":"10.1162\/tacl_a_00461"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Yang, F., Su, T., Zhou, X., Di, D., Wang, Z., Li, S.: Self-supervised cross-language scene text editing. In: ACM MM (2023)","DOI":"10.1145\/3581783.3612174"},{"key":"23_CR52","unstructured":"Yang, Y., Gui, D., Yuan, Y., Ding, H., Hu, H., Chen, K.: Glyphcontrol: glyph conditional control for visual text generation. In: NeurIPS (2023)"},{"key":"23_CR53","unstructured":"You, H., et al.: Ferret: refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704 (2023)"},{"key":"23_CR54","unstructured":"Yu, Y., Zeng, Z., Hua, H., Fu, J., Luo, J.: Promptfix: you prompt and we fix the photo. arXiv preprint arXiv:2405.16785 (2024)"},{"key":"23_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, L., Chen, X., Wang, Y., Lu, Y., Qiao, Y.: Brush your text: synthesize any scene text on images via diffusion model. arXiv preprint arXiv:2312.12232 (2023)","DOI":"10.1609\/aaai.v38i7.28550"},{"key":"23_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"23_CR57","unstructured":"Zhang, S., et al.: Gpt4roi: instruction tuning large language model on region-of-interest. arXiv preprint arXiv:2307.03601 (2023)"},{"key":"23_CR58","unstructured":"Zhao, S., et al.: Uni-controlnet: all-in-one control to text-to-image diffusion models. In: NeurIPS (2023)"},{"key":"23_CR59","unstructured":"Zheng, L., et al.: Judging llm-as-a-judge with mt-bench and chatbot arena (2023)"},{"key":"23_CR60","unstructured":"Zhou, Q., Yu, C., Zhang, S., Wu, S., Wang, Z., Wang, F.: Regionblip: a unified multi-modal pre-training framework for holistic and regional comprehension. arXiv preprint arXiv:2308.02299 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72652-1_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:38:35Z","timestamp":1730191115000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72652-1_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031726514","9783031726521"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72652-1_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}