{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:00:37Z","timestamp":1776931237993,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772318.3791172","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T04:12:36Z","timestamp":1776053556000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Building Benchmarks from the Ground Up: Community-Centered Evaluation of LLMs in Healthcare Chatbot Settings"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9684-1192","authenticated-orcid":false,"given":"Hamna","family":"Hamna","sequence":"first","affiliation":[{"name":"Microsoft Research India, Microsoft Corporation, Bangalore, Karnataka, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4876-2171","authenticated-orcid":false,"given":"Gayatri","family":"Bhat","sequence":"additional","affiliation":[{"name":"Karya, Bengaluru, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1713-2769","authenticated-orcid":false,"given":"Sourabrata","family":"Mukherjee","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, Karnataka, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1209-8933","authenticated-orcid":false,"given":"Faisal M.","family":"Lalani","sequence":"additional","affiliation":[{"name":"Collective Intelligence Project, New York, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8346-9148","authenticated-orcid":false,"given":"Evan","family":"Hadfield","sequence":"additional","affiliation":[{"name":"Collective Intelligence Project, New York, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7073-6728","authenticated-orcid":false,"given":"Divya","family":"Siddarth","sequence":"additional","affiliation":[{"name":"Collective Intelligence Project, New York, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9275-742X","authenticated-orcid":false,"given":"Kalika","family":"Bali","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bangalore, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4251-9719","authenticated-orcid":false,"given":"Sunayana","family":"Sitaram","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bangalore, Karnataka, India"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_4_2_2","doi-asserted-by":"publisher","unstructured":"Prottay\u00a0Kumar Adhikary Isha Motiyani Gayatri Oke Maithili Joshi Kanupriya Pathak Salam\u00a0Michael Singh and Tanmoy Chakraborty. 2025. Menstrual Health Education Using a Specialized Large Language Model in India: Development and Evaluation Study of MenstLLaMA. J Med Internet Res 27 (16 Jul 2025) e71977. 10.2196\/71977","DOI":"10.2196\/71977"},{"key":"e_1_3_3_4_3_2","first-page":"85","volume-title":"International Conference on Machine Learning","author":"Agarwal Arpit","year":"2020","unstructured":"Arpit Agarwal, Shivani Agarwal, Sanjeev Khanna, and Prathamesh Patil. 2020. Rank aggregation from pairwise comparisons in the presence of adversarial corruptions. In International Conference on Machine Learning. PMLR, 85\u201395."},{"key":"e_1_3_3_4_4_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.258"},{"key":"e_1_3_3_4_5_2","doi-asserted-by":"publisher","unstructured":"Mohammad Atari Mona Xue Peter Park Dami\u00e1n Blasi and Joseph Henrich. 2023. Which Humans? (09 2023). 10.31234\/osf.io\/5b26t","DOI":"10.31234\/osf.io\/5b26t"},{"key":"e_1_3_3_4_6_2","doi-asserted-by":"publisher","unstructured":"Emily\u00a0M. Bender and Batya Friedman. 2018. Data Statements for Natural Language Processing: Toward Mitigating System Bias and Enabling Better Science. Transactions of the Association for Computational Linguistics 6 (2018) 587\u2013604. 10.1162\/tacl_a_00041","DOI":"10.1162\/tacl_a_00041"},{"key":"e_1_3_3_4_7_2","doi-asserted-by":"publisher","unstructured":"Stevie Bergman Nahema Marchal John Mellor Shakir Mohamed Iason Gabriel and William Isaac. 2024. STELA: a community-centred approach to norm elicitation for AI alignment. Scientific Reports 14 (03 2024). 10.1038\/s41598-024-56648-4","DOI":"10.1038\/s41598-024-56648-4"},{"key":"e_1_3_3_4_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3551624.3555290"},{"key":"e_1_3_3_4_9_2","doi-asserted-by":"publisher","unstructured":"Leona\u00a0Cilar Budler Hongyu Chen Aokun Chen Maxim Topaz Wilson Tam Jiang Bian and Gregor Stiglic. 2025. A Brief Review on Benchmarking for Large Language Models Evaluation in Healthcare. WIREs Data Mining and Knowledge Discovery 15 2 (2025) e70010. 10.1002\/widm.70010e70010 DMKD-00787.R1.","DOI":"10.1002\/widm.70010"},{"key":"e_1_3_3_4_10_2","doi-asserted-by":"publisher","unstructured":"Yupeng Chang Xu Wang Jindong Wang Yuan Wu Linyi Yang Kaijie Zhu Hao Chen Xiaoyuan Yi Cunxiang Wang Yidong Wang Wei Ye Yue Zhang Yi Chang Philip\u00a0S. Yu Qiang Yang and Xing Xie. 2024. A Survey on Evaluation of Large Language Models. 15 3 Article 39 (2024) 45\u00a0pages. 10.1145\/3641289","DOI":"10.1145\/3641289"},{"key":"e_1_3_3_4_11_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1247"},{"key":"e_1_3_3_4_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3617694.3623261"},{"key":"e_1_3_3_4_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713362"},{"key":"e_1_3_3_4_14_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.626"},{"key":"e_1_3_3_4_15_2","doi-asserted-by":"crossref","unstructured":"Gaohong Dong David\u00a0C Hoaglin Junshan Qiu Roland\u00a0A Matsouaka Yu-Wei Chang Jiuzhou Wang and Marc Vandemeulebroecke. 2020. The win ratio: on interpretation and handling of ties. Statistics in Biopharmaceutical Research (2020).","DOI":"10.1080\/19466315.2019.1575279"},{"key":"e_1_3_3_4_16_2","unstructured":"Jay Gala Pranjal\u00a0A. Chitale Raghavan AK Varun Gumma Sumanth Doddapaneni Aswanth Kumar Janki Nawale Anupama Sujatha Ratish Puduppully Vivek Raghavan Pratyush Kumar Mitesh\u00a0M. Khapra Raj Dabre and Anoop Kunchukuttan. 2023. IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages. arxiv:https:\/\/arXiv.org\/abs\/2305.16307\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2305.16307"},{"key":"e_1_3_3_4_17_2","doi-asserted-by":"crossref","unstructured":"Fabrizio Gilardi Meysam Alizadeh and Ma\u00ebl Kubli. 2023. ChatGPT outperforms crowd workers for text-annotation tasks. Proceedings of the National Academy of Sciences 120 30 (2023).","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_3_4_18_2","unstructured":"A. Grattafiori and Meta AI. 2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_4_19_2","doi-asserted-by":"crossref","unstructured":"Neel Guha Julian Nyarko Daniel\u00a0E. Ho Christopher R\u00e9 Adam Chilton Aditya Narayana Alex Chohlas-Wood Austin Peters Brandon Waldon Daniel\u00a0N. Rockmore Diego Zambrano Dmitry Talisman Enam Hoque Faiz Surani Frank Fagan Galit Sarfaty Gregory\u00a0M. Dickinson Haggai Porat Jason Hegland Jessica Wu Joe Nudell Joel Niklaus John Nay Jonathan\u00a0H. Choi Kevin Tobia Margaret Hagan Megan Ma Michael Livermore Nikon Rasumov-Rahe Nils Holzenberger Noam Kolt Peter Henderson Sean Rehaag Sharad Goel Shang Gao Spencer Williams Sunny Gandhi Tom Zur Varun Iyer and Zehua Li. 2023. LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2308.11462\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2308.11462","DOI":"10.2139\/ssrn.4583531"},{"key":"e_1_3_3_4_20_2","unstructured":"Varun Gumma Anandhita Raghunath Mohit Jain and Sunayana Sitaram. 2024. HEALTH-PARIKSHA: Assessing RAG Models for Health Chatbots in Real-World Multilingual Settings. arxiv:https:\/\/arXiv.org\/abs\/2410.13671\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.13671"},{"key":"e_1_3_3_4_21_2","unstructured":"Rishav Hada Varun Gumma Adrian de Wynter Harshita Diddee Mohamed Ahmed Monojit Choudhury Kalika Bali and Sunayana Sitaram. 2023. Are large language model-based evaluators the solution to scaling up multilingual evaluation? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.07462 (2023)."},{"key":"e_1_3_3_4_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3630106.3658927"},{"key":"e_1_3_3_4_23_2","unstructured":"Siobhan\u00a0Mackenzie Hall Samantha Dalal Raesetje Sefala Foutse Yuehgoh Aisha Alaagib Imane Hamzaoui Shu Ishida Jabez Magomere Lauren Crais Aya Salama and Tejumade Afonja. 2025. The Human Labour of Data Work: Capturing Cultural Diversity through World Wide Dishes. arxiv:https:\/\/arXiv.org\/abs\/2502.05961\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2502.05961"},{"key":"e_1_3_3_4_24_2","unstructured":"Zhengyu Hu Linxin Song Jieyu Zhang Zheyuan Xiao Tianfu Wang Zhengyu Chen Nicholas\u00a0Jing Yuan Jianxun Lian Kaize Ding and Hui Xiong. 2024. Explaining length bias in llm-based preference evaluations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.01085 (2024)."},{"key":"e_1_3_3_4_25_2","doi-asserted-by":"publisher","unstructured":"Shafquat Hussain and Athula Ginige. 2018. Extending a Conventional Chatbot Knowledge Base to External Knowledge Source and Introducing User Based Sessions for Diabetes Education. 698\u2013703. 10.1109\/WAINA.2018.00170","DOI":"10.1109\/WAINA.2018.00170"},{"key":"e_1_3_3_4_26_2","doi-asserted-by":"crossref","unstructured":"Di Jin Eileen Pan Nassim Oufattole Wei-Hung Weng Hanyi Fang and Peter Szolovits. 2020. What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset from Medical Exams. arxiv:https:\/\/arXiv.org\/abs\/2009.13081\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2009.13081","DOI":"10.20944\/preprints202105.0498.v1"},{"key":"e_1_3_3_4_27_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.445"},{"key":"e_1_3_3_4_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715275.3732147"},{"key":"e_1_3_3_4_29_2","unstructured":"Dayeon Ki Rachel Rudinger Tianyi Zhou and Marine Carpuat. 2025. Multiple LLM Agents Debate for Equitable Cultural Alignment. arxiv:https:\/\/arXiv.org\/abs\/2505.24671\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2505.24671"},{"key":"e_1_3_3_4_30_2","unstructured":"Fajri Koto. 2025. Cracking the Code: Multi-domain LLM Evaluation on Real-World Professional Exams in Indonesia. arxiv:https:\/\/arXiv.org\/abs\/2409.08564\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2409.08564"},{"key":"e_1_3_3_4_31_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60114-0_10"},{"key":"e_1_3_3_4_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642278"},{"key":"e_1_3_3_4_33_2","unstructured":"Cheng Li Mengzhou Chen Jindong Wang Sunayana Sitaram and Xing Xie. 2024. CultureLLM: Incorporating Cultural Differences into Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2402.10946\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2402.10946"},{"key":"e_1_3_3_4_34_2","doi-asserted-by":"crossref","unstructured":"Cheng Li Damien Teney Linyi Yang Qingsong Wen Xing Xie and Jindong Wang. 2024. Culturepark: Boosting cross-cultural understanding in large language models. Advances in Neural Information Processing Systems 37 (2024) 65183\u201365216.","DOI":"10.52202\/079017-2082"},{"key":"e_1_3_3_4_35_2","unstructured":"Haitao Li Qian Dong Junjie Chen Huixue Su Yujia Zhou Qingyao Ai Ziyi Ye and Yiqun Liu. 2024. Llms-as-judges: a comprehensive survey on llm-based evaluation methods. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05579 (2024)."},{"key":"e_1_3_3_4_36_2","volume-title":"ACL","author":"Liu Jiacheng","year":"2023","unstructured":"Jiacheng Liu et\u00a0al. 2023. GPTEval: NLG Evaluation using GPT-4 with Better Human Alignment. In ACL."},{"key":"e_1_3_3_4_37_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Dan Iter Yichong Xu Shuohang Wang Ruochen Xu and Chenguang Zhu. 2023. G-eval: NLG evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.16634 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_3_4_38_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.167"},{"key":"e_1_3_3_4_39_2","doi-asserted-by":"publisher","unstructured":"Milagros Miceli and Julian Posada. 2022. The Data-Production Dispositif. 6 CSCW2 Article 460 (Nov. 2022) 37\u00a0pages. 10.1145\/3555561","DOI":"10.1145\/3555561"},{"key":"e_1_3_3_4_40_2","doi-asserted-by":"crossref","unstructured":"Junho Myung Nayeon Lee Yi Zhou Jiho Jin Rifki Putri Dimosthenis Antypas Hsuvas Borkakoty Eunsu Kim Carla Perez-Almendros Abinew\u00a0Ali Ayele et\u00a0al. 2024. Blend: A benchmark for llms on everyday knowledge in diverse cultures and languages. Advances in Neural Information Processing Systems 37 (2024) 78104\u201378146.","DOI":"10.52202\/079017-2483"},{"key":"e_1_3_3_4_41_2","doi-asserted-by":"publisher","unstructured":"Mihai Nad\u01ce\u015f Laura Dio\u015fan and Andreea Tomescu. 2025. Synthetic Data Generation Using Large Language Models: Advances in Text and Code. IEEE Access 13 (2025) 134615\u2013134633. 10.1109\/access.2025.3589503","DOI":"10.1109\/access.2025.3589503"},{"key":"e_1_3_3_4_42_2","volume-title":"Annual Meeting of the Association for Computational Linguistics","author":"Naous Tarek","year":"2023","unstructured":"Tarek Naous, Michael\u00a0Joseph Ryan, and Wei Xu. 2023. Having Beer after Prayer? Measuring Cultural Bias in Large Language Models. In Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:258865272"},{"key":"e_1_3_3_4_43_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.ijcnlp-main.42"},{"key":"e_1_3_3_4_44_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.96"},{"key":"e_1_3_3_4_45_2","unstructured":"Open Life Science AI and Hugging Face. 2026. The Open Medical-LLM Leaderboard: Benchmarking Large Language Models in Healthcare. https:\/\/huggingface.co\/blog\/leaderboard-medicalllm. Hugging Face blog post."},{"key":"e_1_3_3_4_46_2","unstructured":"OpenAI. 2024. GPT-4o: OpenAI\u2019s GPT-4 Omni Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2405.15544\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2405.15544"},{"key":"e_1_3_3_4_47_2","doi-asserted-by":"publisher","unstructured":"Natalia O\u017cegalska-\u0141ukasik and Szymon \u0141ukasik. 2023. Culturally Responsive Artificial Intelligence \u2013 Problems Challenges and Solutions. Intercultural Relations 7 (12 2023) 106\u2013119. 10.12797\/RM.02.2023.14.07","DOI":"10.12797\/RM.02.2023.14.07"},{"key":"e_1_3_3_4_48_2","unstructured":"Ankit Pal AIKosh \/\u00a0India\u2011AI team and ekacare. 2025. MedMCQA\u2011Indic: Indic\u2011language version of the MedMCQA medical exam benchmark. https:\/\/huggingface.co\/datasets\/ekacare\/MedMCQA-Indic. Hugging Face dataset."},{"key":"e_1_3_3_4_49_2","unstructured":"Ankit Pal Logesh\u00a0Kumar Umapathi and Malaikannan Sankarasubbu. 2022. MedMCQA : A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering. arxiv:https:\/\/arXiv.org\/abs\/2203.14371\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2203.14371"},{"key":"e_1_3_3_4_50_2","unstructured":"Vinodkumar Prabhakaran Rida Qadri and Ben Hutchinson. 2022. Cultural Incongruencies in Artificial Intelligence. arxiv:https:\/\/arXiv.org\/abs\/2211.13069\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2211.13069"},{"key":"e_1_3_3_4_51_2","unstructured":"Rida Qadri Mark Diaz Ding Wang and Michael Madaio. 2025. The Case for \"Thick Evaluations\" of Cultural Representation in AI. arxiv:https:\/\/arXiv.org\/abs\/2503.19075\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2503.19075"},{"key":"e_1_3_3_4_52_2","doi-asserted-by":"publisher","unstructured":"Libo Qin Qiguang Chen Yuhang Zhou Zhi Chen Yinghui Li Lizi Liao Min Li Wanxiang Che and Philip\u00a0S. Yu. 2025. A survey of multilingual large language models. Patterns 6 1 (2025) 101118. 10.1016\/j.patter.2024.101118","DOI":"10.1016\/j.patter.2024.101118"},{"key":"e_1_3_3_4_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713680"},{"key":"e_1_3_3_4_54_2","unstructured":"David Rein Betty\u00a0Li Hou Asa\u00a0Cooper Stickland Jackson Petty Richard\u00a0Yuanzhe Pang Julien Dirani Julian Michael and Samuel\u00a0R. Bowman. 2023. GPQA: A Graduate-Level Google-Proof Q&A Benchmark. arXiv:https:\/\/arXiv.org\/abs\/2311.12022\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2311.12022"},{"key":"e_1_3_3_4_55_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.268"},{"key":"e_1_3_3_4_56_2","doi-asserted-by":"publisher","unstructured":"Benjamin Saunders Julius Sim Tom Kingstone Shula Baker Jackie Waterfield Bernadette Bartlam Heather Burroughs and Clare Jinks. 2018. Saturation in qualitative research: exploring its conceptualization and operationalization. Quality & Quantity 52 4 (2018) 1893\u20131907. 10.1007\/s11135-017-0574-8","DOI":"10.1007\/s11135-017-0574-8"},{"key":"e_1_3_3_4_57_2","doi-asserted-by":"crossref","unstructured":"Ashish Sharma Kevin Rushton Inna\u00a0Wanyin Lin Theresa Nguyen and Tim Althoff. 2024. Facilitating Self-Guided Mental Health Interventions Through Human-Language Model Interaction: A Case Study of Cognitive Restructuring. arxiv:https:\/\/arXiv.org\/abs\/2310.15461\u00a0[cs.HC] https:\/\/arxiv.org\/abs\/2310.15461","DOI":"10.1145\/3613904.3642761"},{"key":"e_1_3_3_4_58_2","unstructured":"Aryan Shrivastava and Paula\u00a0Akemi Aoyagui. 2025. DICE: A Framework for Dimensional and Contextual Evaluation of Language Models. arxiv:https:\/\/arXiv.org\/abs\/2504.10359\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2504.10359"},{"key":"e_1_3_3_4_59_2","unstructured":"Namita Singh Jacqueline Wang\u2019ombe Nereah Okanga Tetyana Zelenska Jona Repishti Jayasankar\u00a0G K Sanjeev Mishra Rajsekar Manokaran Vineet Singh Mohammed\u00a0Irfan Rafiq Rikin Gandhi and Akshay Nambi. 2024. Farmer.Chat: Scaling AI-Powered Agricultural Services for Smallholder Farmers. arxiv:https:\/\/arXiv.org\/abs\/2409.08916\u00a0[cs.ET] https:\/\/arxiv.org\/abs\/2409.08916"},{"key":"e_1_3_3_4_60_2","unstructured":"Shivalika Singh Angelika Romanou Cl\u00e9mentine Fourrier David\u00a0I. Adelani Jian\u00a0Gang Ngui Daniel Vila-Suero Peerat Limkonchotiwat Kelly Marchisio Wei\u00a0Qi Leong Yosephine Susanto Raymond Ng Shayne Longpre Wei-Yin Ko Sebastian Ruder Madeline Smith Antoine Bosselut Alice Oh Andre F.\u00a0T. Martins Leshem Choshen Daphne Ippolito Enzo Ferrante Marzieh Fadaee Beyza Ermis and Sara Hooker. 2025. Global MMLU: Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation. arxiv:https:\/\/arXiv.org\/abs\/2412.03304\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2412.03304"},{"key":"e_1_3_3_4_61_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.620"},{"key":"e_1_3_3_4_62_2","unstructured":"Nicholas Sukiennik Chen Gao Fengli Xu and Yong Li. 2025. An Evaluation of Cultural Value Alignment in LLM. arxiv:https:\/\/arXiv.org\/abs\/2504.08863\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2504.08863"},{"key":"e_1_3_3_4_63_2","unstructured":"Panuthep Tasawong Jian\u00a0Gang Ngui Alham\u00a0Fikri Aji Trevor Cohn and Peerat Limkonchotiwat. 2026. SEA-Guard: Culturally Grounded Multilingual Safeguard for Southeast Asia. arxiv:https:\/\/arXiv.org\/abs\/2602.01618\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2602.01618"},{"key":"e_1_3_3_4_64_2","unstructured":"Sarvam\u00a0AI Team. 2025. Sarvam-M: Multilingual Large Language Model. https:\/\/huggingface.co\/sarvamai\/sarvam-m."},{"key":"e_1_3_3_4_65_2","unstructured":"Vishesh Thakur. 2023. Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and Addressing Sociological Implications. arxiv:https:\/\/arXiv.org\/abs\/2307.09162\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2307.09162"},{"key":"e_1_3_3_4_66_2","unstructured":"Atnafu\u00a0Lambebo Tonja Srija Anand Emilio Villa-Cueva Israel\u00a0Abebe Azime Jesujoba\u00a0Oluwadara Alabi Muhidin\u00a0A. Mohamed Debela\u00a0Desalegn Yadeta Negasi\u00a0Haile Abadi Abigail Oppong Nnaemeka\u00a0Casmir Obiefuna Idris Abdulmumin Naome\u00a0A Etori Eric\u00a0Peter Wairagala Kanda\u00a0Patrick Tshinu Imanigirimbabazi Emmanuel Gabofetswe Malema Alham\u00a0Fikri Aji David\u00a0Ifeoluwa Adelani and Thamar Solorio. 2026. Afri-MCQA: Multimodal Cultural Question Answering for African Languages. arxiv:https:\/\/arXiv.org\/abs\/2601.05699\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2601.05699"},{"key":"e_1_3_3_4_67_2","unstructured":"Sshubam Verma Mohammed Safi Ur\u00a0Rahman Khan Vishwajeet Kumar Rudra Murthy and Jaydeep Sen. 2025. MILU: A Multi-task Indic Language Understanding Benchmark. arxiv:https:\/\/arXiv.org\/abs\/2411.02538\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2411.02538"},{"key":"e_1_3_3_4_68_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_3_4_69_2","volume-title":"EMNLP","author":"Wang Weizhe","year":"2023","unstructured":"Weizhe Wang et\u00a0al. 2023. Large Language Models are State-of-the-Art Evaluators of Translation Quality. In EMNLP."},{"key":"e_1_3_3_4_70_2","unstructured":"Ishaan Watts Varun Gumma Aditya Yadavalli Vivek Seshadri Manohar Swaminathan and Sunayana Sitaram. 2024. PARIKSHA: A Large-Scale Investigation of Human-LLM Evaluator Agreement on Multilingual and Multi-Cultural Data. arxiv:https:\/\/arXiv.org\/abs\/2406.15053\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2406.15053"},{"key":"e_1_3_3_4_71_2","unstructured":"A. Yang and Qwen Team. 2025. Qwen3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2505.09388\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2505.09388"},{"key":"e_1_3_3_4_72_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1082"},{"key":"e_1_3_3_4_73_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.743"},{"key":"e_1_3_3_4_74_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric\u00a0P. Xing Hao Zhang Joseph\u00a0E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arxiv:https:\/\/arXiv.org\/abs\/2306.05685\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2306.05685"},{"key":"e_1_3_3_4_75_2","doi-asserted-by":"publisher","unstructured":"Hui Zong Rongrong Wu Jiaxue Cha Jiao Wang Erman Wu Jiajin Li Yilin Zhou Chi Zhang Wenqing Feng and Bing Shen. 2024. Large Language Models in Worldwide Medical Exams: Platform Development and Comprehensive Analysis. Journal of Medical Internet Research 26 (2024) e66114. 10.2196\/66114","DOI":"10.2196\/66114"}],"event":{"name":"CHI 2026: CHI Conference on Human Factors in Computing Systems","location":"Barcelona Spain","acronym":"CHI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772318.3791172","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T09:57:55Z","timestamp":1776419875000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772318.3791172"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":74,"alternative-id":["10.1145\/3772318.3791172","10.1145\/3772318"],"URL":"https:\/\/doi.org\/10.1145\/3772318.3791172","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}