{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:03:38Z","timestamp":1777655018506,"version":"3.51.4"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730030","type":"print"},{"value":"9783031730047","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73004-7_5","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T17:02:14Z","timestamp":1730394134000},"page":"70-86","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Bridging Different Language Models and\u00a0Generative Vision Models for\u00a0Text-to-Image Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8910-4649","authenticated-orcid":false,"given":"Shihao","family":"Zhao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8551-3608","authenticated-orcid":false,"given":"Shaozhe","family":"Hao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bojia","family":"Zi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huaizhe","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8560-9007","authenticated-orcid":false,"given":"Kwan-Yee K.","family":"Wong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Bao, F., Li, C., Cao, Y., Zhu, J.: All are worth words: a ViT backbone for score-based diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02171"},{"key":"5_CR2","unstructured":"Betker, J., et al.: Improving image generation with better captions (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: InstructPix2Pix: learning to follow image editing instructions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"5_CR4","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"5_CR5","unstructured":"Chen, J., Huang, Y., Lv, T., Cui, L., Chen, Q., Wei, F.: TextDiffuser: diffusion models as text painters. In: NeurIPS (2024)"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Chen, J., et\u00a0al.: PixArt-$$\\alpha $$: fast training of diffusion transformer for photorealistic text-to-image synthesis. In: ICLR (2024)","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Croitoru, F.A., Hondru, V., Ionescu, R.T., Shah, M.: Diffusion models in vision: a survey. TPAMI (2023)","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"5_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (2018)"},{"key":"5_CR9","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS (2021)"},{"key":"5_CR10","unstructured":"Ding, M., et\u00a0al.: CogView: mastering text-to-image generation via transformers. In: NeurIPS (2021)"},{"key":"5_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"5_CR12","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Ge, S., Park, T., Zhu, J.Y., Huang, J.B.: Expressive text-to-image generation with rich text. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00694"},{"key":"5_CR14","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: NeurIPS (2014)"},{"key":"5_CR15","unstructured":"Guo, Y., et al.: AnimateDiff: animate your personalized text-to-image diffusion models without specific tuning. In: ICLR (2024)"},{"key":"5_CR16","unstructured":"Hao, S., Han, K., Zhao, S., Wong, K.Y.K.: ViCo: detail-preserving visual condition for personalized text-to-image generation. arXiv preprint arXiv:2306.00971 (2023)"},{"key":"5_CR17","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: NeurIPS (2017)"},{"key":"5_CR18","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"key":"5_CR19","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: NeurIPS Workshop on Deep Generative Models and Downstream Applications (2021)"},{"key":"5_CR20","unstructured":"Hu, E.J., et al.: LoRA: Low-rank adaptation of large language models. In: ICLR (2022)"},{"key":"5_CR21","unstructured":"Huang, K., Sun, K., Xie, E., Li, Z., Liu, X.: T2I-CompBench: a comprehensive benchmark for open-world compositional text-to-image generation. arXiv preprint arXiv: 2307.06350 (2023)"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"5_CR23","unstructured":"LAION-AI: aesthetic-predictor (2022). https:\/\/github.com\/LAION-AI\/aesthetic-predictor"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"5_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"5_CR26","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"5_CR27","unstructured":"Meng, C., et al.: SDEdit: guided image synthesis and editing with stochastic differential equations. In: ICLR (2022)"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"5_CR29","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. In: ICML (2021)"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"5_CR31","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"5_CR32","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training. OpenAI blog (2018)"},{"key":"5_CR33","unstructured":"Radford, A., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI blog (2019)"},{"key":"5_CR34","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR (2020)"},{"key":"5_CR35","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"5_CR36","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML (2021)"},{"key":"5_CR37","unstructured":"Reed, S., Akata, Z., Yan, X., Logeswaran, L., Schiele, B., Lee, H.: Generative adversarial text to image synthesis. In: ICML (2016)"},{"key":"5_CR38","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"5_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"5_CR40","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"5_CR42","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"5_CR43","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2021)"},{"key":"5_CR44","unstructured":"Song, Y., Dhariwal, P., Chen, M., Sutskever, I.: Consistency models. In: ICML (2023)"},{"key":"5_CR45","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. In: ICLR (2021)"},{"key":"5_CR46","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: NeurIPS (2014)"},{"key":"5_CR47","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"5_CR48","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"5_CR49","unstructured":"Wu, W., et al.: Paragraph-to-image generation with information-enriched diffusion model. arXiv preprint arXiv:2311.14284 (2023)"},{"key":"5_CR50","unstructured":"Xue, Z., et al.: RAPHAEL: text-to-image generation via large mixture of diffusion paths. In: NeurIPS (2024)"},{"key":"5_CR51","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: StackGAN: text to photo-realistic image synthesis with stacked generative adversarial networks. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.629"},{"key":"5_CR52","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"5_CR53","unstructured":"Zhao, S., et al.: Uni-ControlNet: all-in-one control to text-to-image diffusion models. In: NeurIPS (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73004-7_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T16:16:32Z","timestamp":1732983392000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73004-7_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031730030","9783031730047"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73004-7_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}