{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T00:49:50Z","timestamp":1767314990868,"version":"3.48.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032101914","type":"print"},{"value":"9783032101921","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-10192-1_47","type":"book-chapter","created":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T00:46:22Z","timestamp":1767314782000},"page":"561-573","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Attribute Confusion in Fashion Text-to-Image Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2793-3326","authenticated-orcid":false,"given":"Ziyue","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6400-8859","authenticated-orcid":false,"given":"Federico","family":"Girella","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5932-4371","authenticated-orcid":false,"given":"Yiming","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6029-1532","authenticated-orcid":false,"given":"Davide","family":"Talon","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,2]]},"reference":[{"key":"47_CR1","doi-asserted-by":"crossref","unstructured":"Chefer, H., et\u00a0al.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. TOG (2023)","DOI":"10.1145\/3592116"},{"key":"47_CR2","unstructured":"Cho, J., et\u00a0al.: Davidsonian scene graph: improving reliability in fine-grained evaluation for text-image generation. In: ICLR (2024)"},{"key":"47_CR3","unstructured":"Dai, W., et\u00a0al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: NeurIPS (2023)"},{"key":"47_CR4","unstructured":"Ding, M., et\u00a0al.: Cogview2: faster and better text-to-image generation via hierarchical transformers. In: NeurIPS (2022)"},{"key":"47_CR5","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv:2010.11929 (2020)"},{"key":"47_CR6","unstructured":"Esser, P., et\u00a0al.: Scaling rectified flow transformers for high-resolution image synthesis. In: ICML (2024)"},{"key":"47_CR7","unstructured":"Feng, W., et\u00a0al.: Training-free structured diffusion guidance for compositional text-to-image synthesis. arXiv:2212.05032 (2022)"},{"key":"47_CR8","doi-asserted-by":"crossref","unstructured":"Hessel, J., et\u00a0al.: Clipscore: a reference-free evaluation metric for image captioning. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"47_CR9","unstructured":"Heusel, M., et\u00a0al.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: NIPS (2017)"},{"key":"47_CR10","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: NeurIPS Workshop on Deep Generative Models and Downstream Applications (2021)"},{"key":"47_CR11","unstructured":"Huang, K., et\u00a0al.: T2i-compbench: a comprehensive benchmark for open-world compositional text-to-image generation. In: NeurIPS (2023)"},{"key":"47_CR12","doi-asserted-by":"crossref","unstructured":"Jia, M., et\u00a0al.: Fashionpedia: ontology, segmentation, and an attribute localization dataset. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_19"},{"key":"47_CR13","doi-asserted-by":"crossref","unstructured":"Karpinska, M., et\u00a0al.: The perils of using Mechanical Turk to evaluate open-ended text generation. arXiv:2109.06835 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.97"},{"key":"47_CR14","unstructured":"Kirstain, Y., et\u00a0al.: Pick-a-pic: an open dataset of user preferences for text-to-image generation. In: NeurIPS (2023)"},{"key":"47_CR15","unstructured":"Koishigarina, D., et\u00a0al.: CLIP behaves like a bag-of-words model cross-modally but not Uni-modally. arXiv:2502.03566 (2025)"},{"key":"47_CR16","doi-asserted-by":"crossref","unstructured":"Ku, M., et\u00a0al.: Viescore: towards explainable metrics for conditional image synthesis evaluation. In: ACL (2024)","DOI":"10.18653\/v1\/2024.acl-long.663"},{"key":"47_CR17","unstructured":"Labs, B.F.: Flux. https:\/\/github.com\/black-forest-labs\/flux (2024)"},{"key":"47_CR18","doi-asserted-by":"crossref","unstructured":"Liang, F., et\u00a0al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"47_CR19","doi-asserted-by":"crossref","unstructured":"Lin, Z., et\u00a0al.: Evaluating text-to-visual generation with image-to-text generation. In: ECCV (2024)","DOI":"10.1007\/978-3-031-72673-6_20"},{"key":"47_CR20","doi-asserted-by":"crossref","unstructured":"Liu, H., et\u00a0al.: Improved baselines with visual instruction tuning. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"47_CR21","unstructured":"Nichol, A.Q., et\u00a0al.: Glide: Towards photorealistic image generation and editing with text-guided diffusion models. In: ICML (2022)"},{"key":"47_CR22","doi-asserted-by":"crossref","unstructured":"Otani, M., et\u00a0al.: Toward verifiable and reproducible human evaluation for text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01372"},{"key":"47_CR23","unstructured":"Podell, D., et\u00a0al.: Sdxl: Improving latent diffusion models for high-resolution image synthesis. In: ICLR (2024)"},{"key":"47_CR24","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"47_CR25","unstructured":"Ramesh, A., et\u00a0al.: Hierarchical text-conditional image generation with clip latents. arXiv:2204.06125 (2022)"},{"key":"47_CR26","unstructured":"Ravi, N., et\u00a0al.: Sam 2: Segment anything in images and videos. arXiv:2408.00714 (2024)"},{"key":"47_CR27","unstructured":"Ren, T., et\u00a0al.: Grounded sam: assembling open-world models for diverse visual tasks. arXiv:2401.14159 (2024)"},{"key":"47_CR28","doi-asserted-by":"crossref","unstructured":"Rombach, R., et\u00a0al.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"47_CR29","unstructured":"Stability.ai: Sd-3.5-large. https:\/\/huggingface.co\/stabilityai (2024)"},{"key":"47_CR30","unstructured":"Vivago.ai: Hidream-i1-full. https:\/\/huggingface.co\/HiDream-ai (2025)"},{"key":"47_CR31","unstructured":"Wu, X., et\u00a0al.: Human preference score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. CoRR (2023)"},{"key":"47_CR32","unstructured":"Xu, J., et\u00a0al.: Imagereward: learning and evaluating human preferences for text-to-image generation. In: NeurIPS (2023)"},{"key":"47_CR33","unstructured":"Yarom, M., et\u00a0al.: What you see is what you read? Improving text-image alignment evaluation. In: NeurIPS (2023)"},{"key":"47_CR34","unstructured":"Yuksekgonul, M., et\u00a0al.: When and why vision-language models behave like bags-of-words, and what to do about it? In: ICLR (2023)"}],"container-title":["Lecture Notes in Computer Science","Image Analysis and Processing \u2013 ICIAP 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-10192-1_47","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T00:46:25Z","timestamp":1767314785000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-10192-1_47"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032101914","9783032101921"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-10192-1_47","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"2 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIAP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Image Analysis and Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Rome","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iciap2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iciap.org\/home","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}