{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T17:00:20Z","timestamp":1780765220432,"version":"3.54.1"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729454","type":"print"},{"value":"9783031729461","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72946-1_4","type":"book-chapter","created":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T19:02:08Z","timestamp":1727809328000},"page":"55-71","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Object-Conditioned Energy-Based Attention Map Alignment in\u00a0Text-to-Image Diffusion Models"],"prefix":"10.1007","author":[{"given":"Yasi","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Peiyu","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ying Nian","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,2]]},"reference":[{"key":"4_CR1","doi-asserted-by":"crossref","unstructured":"Agarwal, A., Karanam, S., Joseph, K., Saxena, A., Goswami, K., Srinivasan, B.V.: A-star: test-time attention segregation and retention for text-to-image synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2283\u20132293 (2023)","DOI":"10.1109\/ICCV51070.2023.00217"},{"key":"4_CR2","unstructured":"Balaji, Y., et\u00a0al.: eDiffI: text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Chang, Y., Zhang, Y., Fang, Z., Wu, Y., Bisk, Y., Gao, F.: Skews in the phenomenon space hinder generalization in text-to-image generation. arXiv preprint arXiv:2403.16394 (2024)","DOI":"10.1007\/978-3-031-73021-4_25"},{"issue":"4","key":"4_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"4_CR5","unstructured":"Conwell, C., Ullman, T.: Testing relational understanding in text-guided image generation. arXiv preprint arXiv:2208.00005 (2022)"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"4_CR7","unstructured":"Feng, W., et al.: Training-free structured diffusion guidance for compositional text-to-image synthesis. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=PUIqjT4rzq7"},{"key":"4_CR8","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-or, D.: Prompt-to-prompt image editing with cross-attention control. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=_CDixzkzeyb"},{"key":"4_CR9","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"4_CR10","unstructured":"Honnibal, M., Montani, I.: spacy 2: natural language understanding with bloom embeddings, convolutional neural networks and incremental parsing. To appear 7(1), 411\u2013420 (2017)"},{"key":"4_CR11","unstructured":"Hoogeboom, E., Heek, J., Salimans, T.: Simple diffusion: end-to-end diffusion for high resolution images. arXiv preprint arXiv:2301.11093 (2023)"},{"key":"4_CR12","unstructured":"Hoover, B., et al.: Energy transformer. arXiv preprint arXiv:2302.07253 (2023)"},{"issue":"8","key":"4_CR13","doi-asserted-by":"publisher","first-page":"2554","DOI":"10.1073\/pnas.79.8.2554","volume":"79","author":"JJ Hopfield","year":"1982","unstructured":"Hopfield, J.J.: Neural networks and physical systems with emergent collective computational abilities. Proc. Natl. Acad. Sci. 79(8), 2554\u20132558 (1982)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"4_CR14","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"4_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"4_CR16","doi-asserted-by":"publisher","unstructured":"Liu, N., Li, S., Du, Y., Torralba, A., Tenenbaum, J.B.: Compositional visual generation with composable diffusion models. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13677, pp. 423\u2013439. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_26","DOI":"10.1007\/978-3-031-19790-1_26"},{"issue":"4","key":"4_CR17","doi-asserted-by":"publisher","first-page":"461","DOI":"10.1109\/TIT.1987.1057328","volume":"33","author":"R McEliece","year":"1987","unstructured":"McEliece, R., Posner, E., Rodemich, E., Venkatesh, S.: The capacity of the hopfield associative memory. IEEE Trans. Inf. Theory 33(4), 461\u2013482 (1987)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"4_CR18","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. Adv. Neural Inf. Process. Syst. 26 (2013)"},{"key":"4_CR19","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"4_CR20","unstructured":"Park, G.Y., Kim, J., Kim, B., Lee, S.W., Ye, J.C.: Energy-based cross attention for Bayesian context update in text-to-image diffusion models. arXiv preprint arXiv:2306.09869 (2023)"},{"key":"4_CR21","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"4_CR22","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022). 1(2), 3"},{"key":"4_CR23","unstructured":"Ramsauer, H., et\u00a0al.: Hopfield networks is all you need. arXiv preprint arXiv:2008.02217 (2020)"},{"key":"4_CR24","unstructured":"Rassin, R., Hirsch, E., Glickman, D., Ravfogel, S., Goldberg, Y., Chechik, G.: Linguistic binding in diffusion models: enhancing attribute correspondence through attention map alignment (2023)"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Rassin, R., Ravfogel, S., Goldberg, Y.: DALLE-2 is seeing double: flaws in word-to-concept mapping in text2image models. arXiv preprint arXiv:2210.10606 (2022)","DOI":"10.18653\/v1\/2022.blackboxnlp-1.28"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695, June 2022","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Flava: a foundational language and vision alignment model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15638\u201315650 (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"4_CR29","unstructured":"Xie, J., Lu, Y., Zhu, S.C., Wu, Y.: A theory of generative convnet. In: International Conference on Machine Learning, pp. 2635\u20132644. PMLR (2016)"},{"key":"4_CR30","unstructured":"Yu, J., et al.: Scaling autoregressive models for content-rich text-to-image generation (2022)"},{"key":"4_CR31","unstructured":"Yu, P., et al.: Latent diffusion energy-based model for interpretable text modeling. arXiv preprint arXiv:2206.05895 (2022)"},{"key":"4_CR32","first-page":"14264","volume":"34","author":"P Yu","year":"2021","unstructured":"Yu, P., Xie, S., Ma, X., Zhu, Y., Wu, Y.N., Zhu, S.C.: Unsupervised foreground extraction via deep region competition. Adv. Neural Inf. Process. Syst. 34, 14264\u201314279 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"4_CR33","unstructured":"Yu, P., et\u00a0al.: Latent energy-based odyssey: black-box optimization via expanded exploration in the energy-based latent space. arXiv preprint arXiv:2405.16730 (2024)"},{"key":"4_CR34","unstructured":"Yu, P., et al.: Learning energy-based prior model with diffusion-amortized mcmc. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"4_CR35","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bags-of-words, and what to do about it? In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"4_CR36","unstructured":"Zeng, Y., Zhang, X., Li, H.: Multi-grained vision language pre-training: aligning texts with visual concepts. arXiv preprint arXiv:2111.08276 (2021)"},{"key":"4_CR37","unstructured":"Zhang, Y., et al.: Flow priors for linear inverse problems via iterative corrupted trajectory matching. arXiv preprint arXiv:2405.18816 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72946-1_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T23:35:21Z","timestamp":1732836921000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72946-1_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,2]]},"ISBN":["9783031729454","9783031729461"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72946-1_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,2]]},"assertion":[{"value":"2 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}