{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,30]],"date-time":"2026-07-30T14:12:14Z","timestamp":1785420734277,"version":"3.56.0"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198359","type":"print"},{"value":"9783031198366","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19836-6_6","type":"book-chapter","created":{"date-parts":[[2022,10,21]],"date-time":"2022-10-21T09:04:58Z","timestamp":1666343098000},"page":"88-105","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":223,"title":["VQGAN-CLIP: Open Domain Image Generation and\u00a0Editing with\u00a0Natural Language Guidance"],"prefix":"10.1007","author":[{"given":"Katherine","family":"Crowson","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Stella","family":"Biderman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Daniel","family":"Kornis","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dashiell","family":"Stander","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Eric","family":"Hallahan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Louis","family":"Castricato","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Edward","family":"Raff","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,10,22]]},"reference":[{"key":"6_CR1","unstructured":"Ali, S., Parikh, D.: Telling creative stories using generative visual aids (2021). arXiv: 2110.14810v1 [cs.HC]"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Lischinski, D., Fried, O.: Blended diffusion for text-driven editing of natural images (2021). arXiv: 2111.14818v1 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"6_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1007\/978-3-030-58452-8_21","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D Bau","year":"2020","unstructured":"Bau, D., Liu, S., Wang, T., Zhu, J.-Y., Torralba, A.: Rewriting a deep generative model. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 351\u2013369. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_21"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Black, S., et al.: GPT-NeoX-20B: an open-source autoregressive language model. Preprint (2022)","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"6_CR5","unstructured":"Chen, G., Dumay, A., Tang, M.: diffvg+CLIP: generating painting trajectories from text. Preprint (2021)"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Couairon, G., Grechka, A., Verbeek, J., Schwenk, H., Cord, M.: FlexIT: towards flexible semantic image translation (2022). arXiv: 2203.04705 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01773"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"De Cao, N., Aziz, W., Titov, I.: Editing factual knowledge in language models (2021). arXiv: 2104.08164v2 [cs.CL]","DOI":"10.18653\/v1\/2021.emnlp-main.522"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Dong, H., Yu, S., Wu, C., Guo, Y.: Semantic image synthesis via adversarial learning. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 5706\u20135714 (2017)","DOI":"10.1109\/ICCV.2017.608"},{"key":"6_CR10","unstructured":"Eichenberg, C., Black, S., Weinbach, S., Parcalabescu, L., Frank, A.: MAGMA \u2013 multimodal augmentation of generative models through adapter-based finetuning (2021). arXiv: 2112.05253v1 [cs.CV]"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"6_CR12","unstructured":"Fei, N., et al.: WenLan 2.0: make AI imagine via a multimodal foundation model (2021). arXiv: 2110.14378v1 [cs.AI]"},{"key":"6_CR13","unstructured":"Frans, K., Soros, L.B., Witkowski, O.: CLIPDraw: exploring text-to-drawing synthesis through language-image encoders (2021). arXiv: 2106.14843v1 [cs.CV]"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Galanos, T., Liapis, A., Yannakakis, G.N.: AffectGAN: affect-based generative art driven by semantics. In: 9th International Conference on Affective Computing and Intelligent Interaction Workshops and Demos (ACIIW) (2021)","DOI":"10.1109\/ACIIW52867.2021.9666317"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis (2021). arXiv:2111.14822v3 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"6_CR16","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: International Conference on Machine Learning, pp. 2790\u20132799 (2019)"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Hu, X., Yu, P., Knight, K., Ji, H., Li, B., Shi, H.: MUSE: textual attributes guided portrait painting generation. In: 2021 IEEE 4th International Conference on Multimedia Information Processing and Retrieval (MIPR), pp. 386\u2013392 (2021)","DOI":"10.1109\/MIPR51284.2021.00072"},{"key":"6_CR18","unstructured":"Jang, J., Shin, S., Kim, Y.: Music2Video: automatic generation of music video with fusion of audio and text (2022). arXiv: 2201.03809v1 [cs.SD]"},{"key":"6_CR19","unstructured":"Kim, S., Cho, S., Kim, C., Lee, D., Baek, W.: minDALL-E on conceptual captions (2021). https:\/\/github.com\/kakaobrain\/minDALL-E"},{"key":"6_CR20","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization (2014). arXiv: 1412.6980v9 [cs.LG]"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Kwon, G., Ye, J.C.: CLIPstyler: image style transfer with a single text condition (2021). arXiv: 2112.00374v2 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01753"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning (2021). arXiv: 2104.08691v2 [cs.CL]","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Li, B., Qi, X., Lukasiewicz, T., Torr, P.H.S.: ManiGAN: text-guided image manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7880\u20137889 (2020)","DOI":"10.1109\/CVPR42600.2020.00790"},{"key":"6_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/978-3-030-58621-8_6","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Liu","year":"2020","unstructured":"Liu, X., et al.: Open-Edit: open-domain image manipulation with open-vocabulary instructions. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part XI. LNCS, vol. 12356, pp. 89\u2013106. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_6"},{"key":"6_CR25","unstructured":"Liu, X., Gong, C., Lemeng, W., Zhang, S., Hao, S., Liu, Q.: FuseDream: training-free text-to-image generation with improved CLIP+GAN space optimization (2021). arXiv: 2112.01573v1 [cs.CV]"},{"key":"6_CR26","unstructured":"Matena, M., Raffel, C.: Merging models with fisher-weighted averaging (2021). arXiv: 2111.09832v1 [cs.LG]"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Michel, O., Bar-On, R., Liu, R., Benaim, S., Hanocka, R.: Text2Mesh: text-driven neural stylization for meshes (2021). arXiv: 2112.03221v1 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01313"},{"key":"6_CR28","unstructured":"Mitchell, E., Lin, C., Bosselut, A., Finn, C., Manning, C.D.: Fast model editing at scale (2021). arXiv: 2110.11309v1 [cs.LG]"},{"key":"6_CR29","unstructured":"Mordvintsev, A., Olah, C., Tyka, M.: DeepDream - a code example for visualizing neural networks (2015). https:\/\/ai.googleblog.com\/2015\/07\/deepdream-code-example-for-visualizing.html"},{"key":"6_CR30","unstructured":"Murdock, R.: The taming transformers decoder really just goes! And this is with very little work. https:\/\/twitter.com\/advadnoun\/status\/1367556678896394240"},{"key":"6_CR31","unstructured":"Murdock, R.: Working on using the rn50x4 version of clip with the taming transformers VQGAN. https:\/\/twitter.com\/advadnoun\/status\/1368081153375105027"},{"key":"6_CR32","unstructured":"Nam, S., Kim, Y., Kim, S.J.: Text-adaptive generative adversarial networks: manipulating images with natural language. In: Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 31, pp. 42\u201351. Curran Associates Inc. (2018). https:\/\/papers.neurips.cc\/paper\/2018\/hash\/d645920e395fedad7bbbed0eca3fe2e0-Abstract.html"},{"key":"6_CR33","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models (2021). arXiv: 2112.10741v3 [cs.CV]"},{"key":"6_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1007\/978-3-030-58542-6_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"E Ntavelis","year":"2020","unstructured":"Ntavelis, E., Romero, A., Kastanis, I., Van Gool, L., Timofte, R.: SESAME: semantic editing of scenes by adding, manipulating or erasing objects. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12367, pp. 394\u2013411. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58542-6_24"},{"key":"6_CR35","unstructured":"van den Oord, A., Vinyals, O., Kavukcuoglu, K.: Neural discrete representation learning. In: Advances in Neural Information Processing Systems, vol. 30, pp. 6309\u20136318. Curran Associates, Inc. (2017)"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., Lischinski, D.: StyleCLIP: text-driven manipulation of StyleGAN imagery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2085\u20132094 (2021)","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"6_CR37","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 139, pp. 8748\u20138763. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"6_CR38","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 139, pp. 8821\u20138831. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/ramesh21a.html"},{"key":"6_CR39","doi-asserted-by":"crossref","unstructured":"Riba, E., Mishkin, D., Ponsa, D., Rublee, E., Bradski, G.R.: Kornia: an open source differentiable computer vision library for PyTorch. In: 2020 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 3663\u20133672 (2020)","DOI":"10.1109\/WACV45572.2020.9093363"},{"key":"6_CR40","doi-asserted-by":"crossref","unstructured":"Sayers, D., et al.: The dawn of the human-machine era: a forecast of new and emerging language technologies (2021)","DOI":"10.17011\/jyx\/reports\/20210518\/1"},{"key":"6_CR41","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., et al.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In 2017 IEEE International Conference on Computer Vision (ICCV), pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"6_CR42","unstructured":"Sharir, O., Peleg, B., Shoham, Y.: The Cost of training NLP models: a concise overview (2020)"},{"key":"6_CR43","doi-asserted-by":"publisher","unstructured":"Shocher, A., et al.: Semantic pyramid for image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7457\u20137466 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00748","DOI":"10.1109\/CVPR42600.2020.00748"},{"key":"6_CR44","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps (2014). arXiv:1312.6034v2 [cs.CV]"},{"key":"6_CR45","unstructured":"Snell, C.: Alien Dreams: An Emerging Art Scene (2020). https:\/\/ml.berkeley.edu\/blog\/posts\/clip-art\/"},{"key":"6_CR46","unstructured":"Tian, Y., Ha, D.: Modern evolution strategies for creativity: fitting concrete images and abstract concepts (2021). arXiv: 2109.08857v2 [cs.NE]"},{"key":"6_CR47","unstructured":"Tsimpoukelli, M., Menick, J., Cabi, S., Eslami, S.A., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. In: Advances in Neural Information Processing Systems (2021)"},{"key":"6_CR48","unstructured":"Underwood, T.: Mapping the latent spaces of culture (2021). https:\/\/tedunderwood.com\/2021\/10\/21\/latent-spaces-of-culture\/"},{"key":"6_CR49","unstructured":"Wang, Z., Liu, W., He, Q.,Wu, X., Yi, Z.: CLIP-GEN: language-free training of a text-to-image generator with CLIP (2022). arXiv: 2203.00386v1 [cs.CV]"},{"key":"6_CR50","doi-asserted-by":"crossref","unstructured":"Wu, H.-H., Seetharaman, P., Kumar, K., Bello, J.P.: Wav2CLIP: learning robust audio representations from CLIP (2021). arXiv: 2110.11499v2 [cs.SD]","DOI":"10.31219\/osf.io\/r2vwf"},{"key":"6_CR51","doi-asserted-by":"crossref","unstructured":"Yang, Z., Buehler, M.J.: Words to matter: de novo architected materials design using transformer neural networks. Front. Mater. 8, 417 (2021)","DOI":"10.3389\/fmats.2021.740754"},{"key":"6_CR52","unstructured":"Yosinski, J., Clune, J., Nguyen, A., Fuchs, T., Lipson, H.: Understanding neural networks through deep visualization (2015). arXiv: 1506.06579v1 [cs.CV]"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19836-6_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,24]],"date-time":"2022-10-24T23:04:50Z","timestamp":1666652690000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19836-6_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198359","9783031198366"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19836-6_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"22 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}