{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:42:43Z","timestamp":1767339763109,"version":"3.40.3"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726514"},{"type":"electronic","value":"9783031726521"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72652-1_19","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"318-334","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Textual-Visual Logic Challenge: Understanding and\u00a0Reasoning in\u00a0Text-to-Image Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7807-8014","authenticated-orcid":false,"given":"Peixi","family":"Xiong","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0939-3297","authenticated-orcid":false,"given":"Michael","family":"Kozuch","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9685-2763","authenticated-orcid":false,"given":"Nilesh","family":"Jain","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Ak, K.E., Lim, J.H., Tham, J.Y., Kassim, A.: Semantically consistent hierarchical text to fashion image synthesis with an enhanced-attentional generative adversarial network. In: 2019 IEEE\/CVF International Conference on Computer Vision Workshop (ICCVW), pp. 3121\u20133124. IEEE (2019)","DOI":"10.1109\/ICCVW.2019.00379"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Bao, F., et al.: All are worth words: a ViT backbone for diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22669\u201322679 (2023)","DOI":"10.1109\/CVPR52729.2023.02171"},{"key":"19_CR3","unstructured":"Blender Foundation: Blender - a free and open source 3D creation suite (2023). https:\/\/www.blender.org\/. Accessed 29 Feb 2024"},{"issue":"1","key":"19_CR4","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1109\/MSP.2017.2765202","volume":"35","author":"A Creswell","year":"2018","unstructured":"Creswell, A., White, T., Dumoulin, V., Arulkumaran, K., Sengupta, B., Bharath, A.A.: Generative adversarial networks: an overview. IEEE Sig. Process. Mag. 35(1), 53\u201365 (2018)","journal-title":"IEEE Sig. Process. Mag."},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"El-Nouby, A., et al.: Tell, draw, and repeat: generating and modifying images based on continual linguistic instruction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10304\u201310312 (2019)","DOI":"10.1109\/ICCV.2019.01040"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"El-Nouby, A., et al.: Tell, draw, and repeat: generating and modifying images based on continual linguistic instruction. In: The IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.01040"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Gao, L., Wang, B., Wang, W.: Image captioning with scene-graph based semantic concepts. In: Proceedings of the 2018 10th International Conference on Machine Learning and Computing, pp. 225\u2013229 (2018)","DOI":"10.1145\/3195106.3195114"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"19_CR9","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Hong, S., Yang, D., Choi, J., Lee, H.: Inferring semantic layout for hierarchical text-to-image synthesis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7986\u20137994 (2018)","DOI":"10.1109\/CVPR.2018.00833"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Johnson, J., Gupta, A., Fei-Fei, L.: Image generation from scene graphs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1219\u20131228 (2018)","DOI":"10.1109\/CVPR.2018.00133"},{"key":"19_CR12","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational Bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"19_CR13","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Liang, W., Jiang, Y., Liu, Z.: GraghVQA: language-guided graph neural networks for graph-based visual question answering. arXiv preprint arXiv:2104.10283 (2021)","DOI":"10.18653\/v1\/2021.maiworkshop-1.12"},{"key":"19_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"19_CR16","doi-asserted-by":"publisher","first-page":"160521","DOI":"10.1109\/ACCESS.2021.3129215","volume":"9","author":"S Matsumori","year":"2021","unstructured":"Matsumori, S., Abe, Y., Shingyouchi, K., Sugiura, K., Imai, M.: LatteGAN: visually guided language attention for multi-turn text-conditioned image manipulation. IEEE Access 9, 160521\u2013160532 (2021)","journal-title":"IEEE Access"},{"key":"19_CR17","unstructured":"Naeem, M.F., Oh, S.J., Uh, Y., Choi, Y., Yoo, J.: Reliable fidelity and diversity metrics for generative models. arXiv abs\/2002.09797 (2020)"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Nguyen, K., Tripathi, S., Du, B., Guha, T., Nguyen, T.Q.: In defense of scene graphs for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1407\u20131416 (2021)","DOI":"10.1109\/ICCV48922.2021.00144"},{"key":"19_CR19","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"19_CR20","unstructured":"OpenAI: Gpt-4: OpenAI\u2019s generative pre-trained transformer 4 (2023). https:\/\/openai.com\/. Accessed 29 Feb 2024"},{"key":"19_CR21","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR23","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Schonfeld, E., Schiele, B., Khoreva, A.: A U-net based discriminator for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8207\u20138216 (2020)","DOI":"10.1109\/CVPR42600.2020.00823"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Teney, D., Liu, L., van Den\u00a0Hengel, A.: Graph-structured representations for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20139 (2017)","DOI":"10.1109\/CVPR.2017.344"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Toutanova, K., Klein, D., Manning, C.D., Singer, Y.: Feature-rich part-of-speech tagging with a cyclic dependency network. In: Proceedings of the 2003 Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics, pp. 252\u2013259 (2003)","DOI":"10.3115\/1073445.1073478"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Vo, N., et al.: Composing text and image for image retrieval-an empirical odyssey. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6439\u20136448 (2019)","DOI":"10.1109\/CVPR.2019.00660"},{"issue":"4","key":"19_CR28","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004). https:\/\/doi.org\/10.1109\/TIP.2003.819861","journal-title":"IEEE Trans. Image Process."},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"Xiong, P., Zhan, H., Wang, X., Sinha, B., Wu, Y.: Visual query answering by entity-attribute graph matching and reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8357\u20138366 (2019)","DOI":"10.1109\/CVPR.2019.00855"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Xu, T., et al.: AttnGAN: fine-grained text to image generation with attentional generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Xu, X., Wu, C., Rosenman, S., Lal, V., Che, W., Duan, N.: Bridgetower: building bridges between encoders in vision-language representation learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 10637\u201310647 (2023)","DOI":"10.1609\/aaai.v37i9.26263"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., Cai, J.: Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10685\u201310694 (2019)","DOI":"10.1109\/CVPR.2019.01094"},{"key":"19_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"711","DOI":"10.1007\/978-3-030-01264-9_42","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Yao","year":"2018","unstructured":"Yao, T., Pan, Y., Li, Y., Mei, T.: Exploring visual relationship for image captioning. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11218, pp. 711\u2013727. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_42"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: StackGAN: text to photo-realistic image synthesis with stacked generative adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5907\u20135915 (2017)","DOI":"10.1109\/ICCV.2017.629"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"19_CR36","doi-asserted-by":"publisher","first-page":"2418","DOI":"10.1007\/s11263-020-01300-7","volume":"128","author":"B Zhao","year":"2020","unstructured":"Zhao, B., Yin, W., Meng, L., Sigal, L.: Layout2image: image generation from layout. Int. J. Comput. Vision 128, 2418\u20132435 (2020)","journal-title":"Int. J. Comput. Vision"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72652-1_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:38:16Z","timestamp":1730191096000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72652-1_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031726514","9783031726521"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72652-1_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}