{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:15:20Z","timestamp":1777889720304,"version":"3.51.4"},"reference-count":76,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01455","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"15680-15691","source":"Crossref","is-referenced-by-count":0,"title":["CuRe: Cultural Gaps in the Long Tail of Text-to-Image Systems"],"prefix":"10.1109","author":[{"given":"Aniket","family":"Rege","sequence":"first","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zinnia","family":"Nie","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mahesh","family":"Ramesh","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Unmesh","family":"Raskar","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhuoran","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aditya","family":"Kusupati","sequence":"additional","affiliation":[{"name":"University of Washington"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong Jae","family":"Lee","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ramya Korlakai","family":"Vinayak","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00474"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1503"},{"key":"ref4","volume-title":"Improving image generation with better captions","author":"Betker","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594095"},{"key":"ref6","article-title":"Demystifying mmd gans","volume-title":"International Conference on Learning Representations","author":"Bi\u0144kowski"},{"key":"ref7","article-title":"Multimodal datasets: misogyny, pornography, and malignant stereotypes","author":"Birhane","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0930"},{"key":"ref9","article-title":"Mllm-as-a-judge: Assessing multimodal llm-as-a-judge with vision-language benchmark","volume-title":"Forty-first International Conference on Machine Learning","author":"Chen"},{"key":"ref10","article-title":"Pal: Sample-efficient personalized reward modeling for pluralistic alignment","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Chen"},{"key":"ref11","article-title":"Pali: A jointly-scaled multilingual language-image model","volume-title":"The Eleventh International Conference on Learning Representations","author":"Chen"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00283"},{"key":"ref13","volume-title":"Food and culture","author":"Counihan","year":"2013"},{"key":"ref14","first-page":"52","article-title":"Does object recognition work for everyone?","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops","author":"De Vries"},{"key":"ref15","volume-title":"Google DeepMind. gemini","year":"2025"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1002\/9781394260645"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1229"},{"key":"ref18","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref19","article-title":"Data filtering networks","volume-title":"The Twelfth International Conference on Learning Representations","author":"Fang"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00901"},{"key":"ref21","volume-title":"The vendi score: A diversity evaluation metric for machine learning","author":"Friedman","year":"2023"},{"key":"ref22","article-title":"Datacomp: In search of the next generation of multimodal datasets","author":"Yitzhak Gadre","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref23","article-title":"Imagenet-trained cnns are biased towards texture; increasing shape bias improves accuracy and robustness","volume-title":"International conference on learning representations","author":"Geirhos"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref25","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref26","volume-title":"Culture\u2019s consequences: Comparing values, behaviors, institutions and organizations across nations","author":"Hofstede","year":"2001"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.667"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"ref30","article-title":"Beyond aesthetics: Cultural competence in text-to-image models","author":"Kannen","year":"2024","journal-title":"NeurIPS D&B Track"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.573"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1594"},{"issue":"3","key":"ref33","volume-title":"Openimages: A public dataset for large-scale multi-label and multi-class image classification","volume":"2","author":"Krasin","year":"2017"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.5555\/2999134.2999257"},{"key":"ref35","volume-title":"Research methods in human-computer interaction","author":"Lazar","year":"2017"},{"key":"ref36","article-title":"A technique for the measurement of attitudes","author":"Likert","year":"1932","journal-title":"Archives of psychology"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref39","article-title":"Towards equitable representation in text-to-image synthesis models with the cross-cultural understanding benchmark (ccub) dataset","author":"Liu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref40","article-title":"Stable bias: Analyzing societal representations in diffusion models","author":"Luccioni","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3600211.3604711"},{"key":"ref42","volume-title":"OpenAI. o3-mini","year":"2025"},{"key":"ref43","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Transactions on Machine Learning Research"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01234"},{"key":"ref45","article-title":"SDXL: Improving latent diffusion models for high-resolution image synthesis","volume-title":"The Twelfth International Conference on Learning Representations","author":"Podell"},{"key":"ref46","volume-title":"Prolific. Prolific","year":"2025"},{"key":"ref47","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"issue":"2","key":"ref48","article-title":"Hierarchical text-conditional image generation with clip latents","volume":"1","author":"Ramesh","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref49","article-title":"Red-teaming the stable diffusion safety filter","author":"Rando","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref51","first-page":"11479","article-title":"Cvqa: Culturally-diverse multilingual visual question answering benchmark","volume":"37","author":"Romero","year":"2025","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2643"},{"key":"ref54","article-title":"Stylegan-t: Unlocking the power of gans for fast large-scale text-to-image synthesis","author":"Sauer","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref55","article-title":"LAION-400M: Open dataset of CLIP-filtered 400 million image-text pairs","volume-title":"Proceedings of Neurips Data-Centric AI Workshop","author":"Schuhmann"},{"key":"ref56","article-title":"LAION-5b: An open large-scale dataset for training next generation image-text models","volume-title":"Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track","author":"Schuhmann"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.353"},{"key":"ref58","article-title":"No classification without representation: Assessing geodiversity issues in open data sets for the developing world","volume-title":"NIPS 2017 workshop: Machine Learning for the Developing World","author":"Shankar"},{"key":"ref59","article-title":"Identifying and eliminating csam in generative ml training data and models","volume":"23","author":"Thiel","year":"2023","journal-title":"Stanford Internet Observatory, Cyber Policy Center"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"ref61","article-title":"Metamorph: Multimodal understanding and generation via instruction tuning","author":"Tong","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref62","volume-title":"UN Trade and Development","year":"2025"},{"key":"ref63","article-title":"Siglip 2: Multilingual vision-language encoders with improved semantic understanding, localization, and dense features","author":"Tschannen","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref64","volume-title":"How ai reduces the world to stereotypes","author":"Turk","year":"2024"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00732"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.513"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01625-5"},{"key":"ref68","article-title":"Emu3: Next-token prediction is all you need","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref69","volume-title":"Wikimedia Commons","year":"2025"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3514094.3534136"},{"key":"ref71","article-title":"Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis","author":"Wu","year":"2023","journal-title":"CoRR"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0700"},{"key":"ref73","article-title":"Scaling autoregressive models for content-rich text-to-image generation","author":"Yu","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref74","article-title":"Scaling autoregressive multimodal models: Pretraining and instruction tuning","author":"Yu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642877"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444025.pdf?arnumber=11444025","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:16:08Z","timestamp":1777612568000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444025\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":76,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01455","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}