{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T07:26:25Z","timestamp":1742973985858,"version":"3.40.3"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730207"},{"type":"electronic","value":"9783031730214"}],"license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73021-4_25","type":"book-chapter","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T09:21:18Z","timestamp":1732094478000},"page":"422-439","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Skews in\u00a0the\u00a0Phenomenon Space Hinder Generalization in\u00a0Text-to-Image Generation"],"prefix":"10.1007","author":[{"given":"Yingshan","family":"Chang","sequence":"first","affiliation":[]},{"given":"Yasi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zhiyuan","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Ying Nian","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Yonatan","family":"Bisk","sequence":"additional","affiliation":[]},{"given":"Feng","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"key":"25_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: Nocaps: novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8948\u20138957 (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"issue":"1","key":"25_CR2","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1162\/neco.1991.3.1.121","volume":"3","author":"V Ajjanagadde","year":"1991","unstructured":"Ajjanagadde, V., Shastri, L.: Rules and variables in neural nets. Neural Comput. 3(1), 121\u2013134 (1991)","journal-title":"Neural Comput."},{"key":"25_CR3","unstructured":"Betker, J., et\u00a0al.: Improving image generation with better captions. Comput. Sci. 2(3), 8 (2023). https:\/\/cdnopenai.com\/papers\/dall-e-3.pdf"},{"key":"25_CR4","unstructured":"Chang, H., et\u00a0al.: Muse: text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)"},{"key":"25_CR5","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"25_CR6","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"25_CR7","unstructured":"Goel, V., et al.: Pair-diffusion: object-level image editing with structure-and-appearance paired diffusion models. arXiv preprint arXiv:2303.17546 (2023)"},{"key":"25_CR8","unstructured":"Gokhale, T., et al.: Benchmarking spatial relationships in text-to-image generation. arXiv preprint arXiv:2212.10015 (2022)"},{"key":"25_CR9","unstructured":"Gui, L., et al.: Training vision-language transformers from captions. Trans. Mach. Learn. Res. (2023). https:\/\/openreview.net\/forum?id=xLnbSpozWS"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: Clipscore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"25_CR11","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"25_CR12","doi-asserted-by":"crossref","unstructured":"Holyoak, K.J.: Analogy and relational reasoning. In: The Oxford Handbook of Thinking and Reasoning, pp. 234\u2013259 (2012)","DOI":"10.1093\/oxfordhb\/9780199734689.013.0013"},{"key":"25_CR13","unstructured":"Huang, K., Sun, K., Xie, E., Li, Z., Liu, X.: T2i-compbench: a comprehensive benchmark for open-world compositional text-to-image generation. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"25_CR14","unstructured":"Hummel, J.E., et al.: A solution to the binding problem for compositional connectionism. In: AAAI Technical Report (3), pp. 31\u201334 (2004)"},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Clevr: a diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Kamath, A., Hessel, J., Chang, K.W.: What\u2019s \u201cup\u201d with vision-language models? investigating their struggle with spatial reasoning. arXiv preprint arXiv:2310.19785 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.568"},{"key":"25_CR17","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"25_CR18","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"25_CR19","unstructured":"Lake, B., Baroni, M.: Generalization without systematicity: on the compositional skills of sequence-to-sequence recurrent networks. In: International Conference on Machine Learning, pp. 2873\u20132882. PMLR (2018)"},{"key":"25_CR20","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Gligen: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Lindemann, M., Koller, A., Titov, I.: Compositional generalisation with structured reordering and fertility layers. arXiv preprint arXiv:2210.03183 (2022)","DOI":"10.18653\/v1\/2023.eacl-main.159"},{"key":"25_CR23","unstructured":"Liu, H., Yan, W., Abbeel, P.: Language quantized autoencoders: towards unsupervised text-image alignment. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"25_CR25","unstructured":"Lovering, C., Pavlick, E.: Training priors predict text-to-image model performance. arXiv preprint arXiv:2306.01755 (2023)"},{"key":"25_CR26","unstructured":"Lu, Y., Yang, X., Li, X., Wang, X.E., Wang, W.Y.: Llmscore: unveiling the power of large language models in text-to-image synthesis evaluation. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"25_CR27","doi-asserted-by":"publisher","unstructured":"Minderer, M., et\u00a0al.: Simple open-vocabulary object detection. In: European Conference on Computer Vision, pp. 728\u2013755. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20080-9_42","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Mo, S., et al.: Freecontrol: training-free spatial control of any text-to-image diffusion model with any condition. arXiv preprint arXiv:2312.07536 (2023)","DOI":"10.1109\/CVPR52733.2024.00713"},{"key":"25_CR29","unstructured":"Okawa, M., Lubana, E.S., Dick, R., Tanaka, H.: Compositional abilities emerge multiplicatively: exploring diffusion models on a synthetic task. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"25_CR31","unstructured":"von Platen, P., et al.: Diffusers: State-of-the-art diffusion models (2022). https:\/\/github.com\/huggingface\/diffusers"},{"key":"25_CR32","unstructured":"Potts, C.: Compositionality or generalization? (2019)"},{"key":"25_CR33","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"1","key":"25_CR34","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"25_CR35","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents, 1(2), 3 (2022). arXiv preprint arXiv:2204.06125"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"25_CR37","first-page":"19861","volume":"33","author":"L Ruis","year":"2020","unstructured":"Ruis, L., Andreas, J., Baroni, M., Bouchacourt, D., Lake, B.M.: A benchmark for systematic generalization in grounded language understanding. Adv. Neural. Inf. Process. Syst. 33, 19861\u201319872 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"25_CR38","unstructured":"Russin, J., et al.: Compositional processing emerges in neural networks solving math problems. In: CogSci... Annual Conference of the Cognitive Science Society. Cognitive Science Society (US). Conference, vol.\u00a02021, p.\u00a01767. NIH Public Access (2021)"},{"key":"25_CR39","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"1\u20132","key":"25_CR40","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1016\/0004-3702(90)90007-M","volume":"46","author":"P Smolensky","year":"1990","unstructured":"Smolensky, P.: Tensor product variable binding and the representation of symbolic structures in connectionist systems. Artif. Intell. 46(1\u20132), 159\u2013216 (1990)","journal-title":"Artif. Intell."},{"key":"25_CR41","doi-asserted-by":"crossref","unstructured":"Tumanyan, N., Geyer, M., Bagon, S., Dekel, T.: Plug-and-play diffusion features for text-driven image-to-image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1921\u20131930 (2023)","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"25_CR42","unstructured":"Wiedemer, T., Mayilvahanan, P., Bethge, M., Brendel, W.: Compositional generalization from first principles. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"25_CR43","unstructured":"Wu, Z., Kreiss, E., Ong, D.C., Potts, C.: Reascan: compositional reasoning in language grounding. arXiv preprint arXiv:2109.08994 (2021)"},{"key":"25_CR44","doi-asserted-by":"crossref","unstructured":"Xiao, G., Yin, T., Freeman, W.T., Durand, F., Han, S.: Fastcomposer: tuning-free multi-subject image generation with localized attention. arXiv preprint arXiv:2305.10431 (2023)","DOI":"10.1007\/s11263-024-02227-z"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. (2014)","DOI":"10.1162\/tacl_a_00166"},{"key":"25_CR46","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation, 2(3), 5 (2022). arXiv preprint arXiv:2206.10789"},{"key":"25_CR47","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bags-of-words, and what to do about it? In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"25_CR48","doi-asserted-by":"crossref","unstructured":"Zeng, Y., et al.: Scenecomposer: any-level semantic image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22468\u201322478 (2023)","DOI":"10.1109\/CVPR52729.2023.02152"},{"key":"25_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"25_CR50","doi-asserted-by":"publisher","DOI":"10.1016\/j.cognition.2023.105711","volume":"244","author":"Y Zhou","year":"2024","unstructured":"Zhou, Y., Feinman, R., Lake, B.M.: Compositional diversity in visual concept learning. Cognition 244, 105711 (2024)","journal-title":"Cognition"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73021-4_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T09:49:15Z","timestamp":1732096155000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73021-4_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"ISBN":["9783031730207","9783031730214"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73021-4_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"21 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}