{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:21:44Z","timestamp":1780356104561,"version":"3.54.1"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031197895","type":"print"},{"value":"9783031197901","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19790-1_26","type":"book-chapter","created":{"date-parts":[[2022,10,23]],"date-time":"2022-10-23T11:02:44Z","timestamp":1666522964000},"page":"423-439","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":258,"title":["Compositional Visual Generation with\u00a0Composable Diffusion Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1380-5428","authenticated-orcid":false,"given":"Nan","family":"Liu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7276-5032","authenticated-orcid":false,"given":"Shuang","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6792-5946","authenticated-orcid":false,"given":"Yilun","family":"Du","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Antonio","family":"Torralba","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joshua B.","family":"Tenenbaum","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,10,24]]},"reference":[{"key":"26_CR1","unstructured":"Austin, J., Johnson, D.D., Ho, J., Tarlow, D., van den Berg, R.: Structured denoising diffusion models in discrete state-spaces. In: Advances in Neural Information Processing Systems (2021)"},{"key":"26_CR2","unstructured":"Bau, D., et al.: Paint by word. arXiv preprint. arXiv:2103.10951 (2021)"},{"key":"26_CR3","unstructured":"Chen, N., Zhang, Y., Zen, H., Weiss, R.J., Norouzi, M., Chan, W.: Wavegrad: estimating gradients for waveform generation. arXiv preprint. arXiv:2009.00713 (2020)"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Chomsky, N.: Aspects of the Theory of Syntax. The MIT Press, Cambridge (1965). http:\/\/www.amazon.com\/Aspects-Theory-Syntax-Noam-Chomsky\/dp\/0262530074","DOI":"10.21236\/AD0616323"},{"key":"26_CR5","unstructured":"DCGM: Gender, age, and emotions extracted for flickr-faces-hq dataset (ffhq) (2020). https:\/\/github.com\/DCGM\/ffhq-features-dataset"},{"key":"26_CR6","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"26_CR7","unstructured":"Du, Y., Li, S., Mordatch, I.: Compositional visual generation with energy based models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6637\u20136647 (2020)"},{"key":"26_CR8","unstructured":"Du, Y., Li, S., Sharma, Y., Tenenbaum, J., Mordatch, I.: Unsupervised learning of compositional energy concepts. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"26_CR9","unstructured":"Du, Y., Li, S., Tenenbaum, J., Mordatch, I.: Improved contrastive divergence training of energy based models. arXiv preprint. arXiv:2012.01316 (2020)"},{"key":"26_CR10","unstructured":"Du, Y., Mordatch, I.: Implicit generation and generalization in energy-based models. arXiv preprint. arXiv:1903.08689 (2019)"},{"key":"26_CR11","unstructured":"Gao, R., Song, Y., Poole, B., Wu, Y.N., Kingma, D.P.: Learning energy-based models by diffusion recovery likelihood. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=v_1Soh8QUNc"},{"key":"26_CR12","unstructured":"Grathwohl, W., Wang, K.C., Jacobsen, J.H., Duvenaud, D., Zemel, R.: Learning the stein discrepancy for training and evaluating energy-based models without sampling. In: International Conference on Machine Learning (2020)"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"issue":"8","key":"26_CR14","doi-asserted-by":"publisher","first-page":"1771","DOI":"10.1162\/089976602760128018","volume":"14","author":"GE Hinton","year":"2002","unstructured":"Hinton, G.E.: Training products of experts by minimizing contrastive divergence. Neural Comput. 14(8), 1771\u20131800 (2002)","journal-title":"Neural Comput."},{"key":"26_CR15","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"26_CR16","unstructured":"Janner, M., Du, Y., Tenenbaum, J., Levine, S.: Planning with diffusion for flexible behavior synthesis. In: International Conference on Machine Learning (2022)"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., Van Der Maaten, L., Fei-Fei, L., Lawrence Zitnick, C., Girshick, R.: Clevr: a diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4401\u20134410 (2019)","DOI":"10.1109\/CVPR.2019.00453"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of stylegan. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8110\u20138119 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"26_CR20","unstructured":"Kim, G., Ye, J.C.: Diffusionclip: text-guided image manipulation using diffusion models (2021)"},{"issue":"6266","key":"26_CR21","doi-asserted-by":"publisher","first-page":"1332","DOI":"10.1126\/science.aab3050","volume":"350","author":"BM Lake","year":"2015","unstructured":"Lake, B.M., Salakhutdinov, R., Tenenbaum, J.B.: Human-level concept learning through probabilistic program induction. Science 350(6266), 1332\u20131338 (2015). https:\/\/doi.org\/10.1126\/science.aab3050","journal-title":"Science"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"LeCun, Y., Chopra, S., Hadsell, R., Ranzato, M., Huang, F.: A tutorial on energy-based learning. Predicting Struct. Data 1(0) (2006)","DOI":"10.7551\/mitpress\/7443.003.0014"},{"key":"26_CR23","unstructured":"Liu, N., Li, S., Du, Y., Tenenbaum, J., Torralba, A.: Learning to compose visual relations. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"26_CR24","unstructured":"Marcus, G., Davis, E., Aaronson, S.: A very preliminary analysis of dall-e 2. arXiv preprint. arXiv:2204.13807 (2022)"},{"key":"26_CR25","unstructured":"Meng, C., et al.: Sdedit: guided image synthesis and editing with stochastic differential equations. In: International Conference on Learning Representations (2021)"},{"key":"26_CR26","unstructured":"Nichol, A., et al.: Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint. arXiv:2112.10741 (2021)"},{"key":"26_CR27","unstructured":"Nie, W., Vahdat, A., Anandkumar, A.: Controllable and compositional generation with latent-space energy-based models. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"26_CR28","doi-asserted-by":"crossref","unstructured":"Nijkamp, E., Hill, M., Han, T., Zhu, S.C., Wu, Y.N.: On the anatomy of mcmc-based maximum likelihood learning of energy-based models. arXiv preprint. arXiv:1903.12370 (2019)","DOI":"10.1609\/aaai.v34i04.5973"},{"key":"26_CR29","doi-asserted-by":"crossref","unstructured":"Parmar, G., Zhang, R., Zhu, J.Y.: On aliased resizing and surprising subtleties in gan evaluation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"26_CR30","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint. arXiv:2204.06125 (2022)"},{"key":"26_CR31","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"26_CR32","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"26_CR33","doi-asserted-by":"crossref","unstructured":"Saharia, C., et al.: Palette: Image-to-image diffusion models. arXiv preprint. arXiv:2111.05826 (2021)","DOI":"10.1145\/3528233.3530757"},{"key":"26_CR34","unstructured":"Salimans, T., Ho, J.: Should EBMs model the energy or the score? In: Energy Based Models Workshop-ICLR 2021 (2021)"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Shoshan, A., Bhonker, N., Kviatkovsky, I., Medioni, G.: Gan-control: explicitly controllable gans. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14083\u201314093 (2021)","DOI":"10.1109\/ICCV48922.2021.01382"},{"key":"26_CR36","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"26_CR37","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: International Conference on Learning Representations (2021)"},{"key":"26_CR38","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv preprint. arXiv:2011.13456 (2020)"},{"key":"26_CR39","unstructured":"Swimmer963: What dall-e 2 can and cannot do (2022). https:\/\/www.lesswrong.com\/posts\/uKp6tBFStnsvrot5t\/what-dall-e-2-can-and-cannot-do"},{"key":"26_CR40","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"issue":"7","key":"26_CR41","doi-asserted-by":"publisher","first-page":"1661","DOI":"10.1162\/NECO_a_00142","volume":"23","author":"P Vincent","year":"2011","unstructured":"Vincent, P.: A connection between score matching and denoising autoencoders. Neural Comput. 23(7), 1661\u20131674 (2011)","journal-title":"Neural Comput."},{"key":"26_CR42","doi-asserted-by":"crossref","unstructured":"Xiao, T., Hong, J., Ma, J.: Elegant: exchanging latent encodings with gan for transferring multiple face attributes. In: Proceedings of the European conference on computer vision (ECCV), pp. 168\u2013184 (2018)","DOI":"10.1007\/978-3-030-01249-6_11"},{"key":"26_CR43","doi-asserted-by":"crossref","unstructured":"Xu, T., Zhang, P., Huang, Q., Zhang, H., Gan, Z., Huang, X., He, X.: Attngan: fine-grained text to image generation with attentional generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"key":"26_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Stackgan: text to photo-realistic image synthesis with stacked generative adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5907\u20135915 (2017)","DOI":"10.1109\/ICCV.2017.629"},{"key":"26_CR45","doi-asserted-by":"crossref","unstructured":"Zhou, L., Du, Y., Wu, J.: 3D shape generation and completion through point-voxel diffusion. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00577"},{"key":"26_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"592","DOI":"10.1007\/978-3-030-58520-4_35","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Zhu","year":"2020","unstructured":"Zhu, J., Shen, Y., Zhao, D., Zhou, B.: In-domain gan inversion for real image editing. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 592\u2013608. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_35"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19790-1_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T15:09:36Z","timestamp":1710256176000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19790-1_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031197895","9783031197901"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19790-1_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"24 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}