{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T03:00:05Z","timestamp":1776394805735,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":181,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,20]],"date-time":"2022-06-20T00:00:00Z","timestamp":1655683200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,21]]},"DOI":"10.1145\/3531146.3533233","type":"proceedings-article","created":{"date-parts":[[2022,6,20]],"date-time":"2022-06-20T14:27:10Z","timestamp":1655735230000},"page":"1859-1876","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":41,"title":["Evaluation Gaps in Machine Learning Practice"],"prefix":"10.1145","author":[{"given":"Ben","family":"Hutchinson","sequence":"first","affiliation":[{"name":"Google Research, Australia"}]},{"given":"Negar","family":"Rostamzadeh","sequence":"additional","affiliation":[{"name":"Google Research, Canada"}]},{"given":"Christina","family":"Greer","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Katherine","family":"Heller","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Vinodkumar","family":"Prabhakaran","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]}],"member":"320","published-online":{"date-parts":[[2022,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1099"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies. 107\u2013112","author":"Alm Cecilia\u00a0Ovesdotter","year":"2011","unstructured":"Cecilia\u00a0Ovesdotter Alm. 2011. Subjective natural language problems: Motivations, applications, characterizations, and implications. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies. 107\u2013112."},{"key":"e_1_3_2_1_3_1","unstructured":"Dario Amodei Chris Olah Jacob Steinhardt Paul Christiano John Schulman and Dan Man\u00e9. 2016. Concrete problems in AI safety. arXiv preprint arXiv:1606.06565(2016)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00146-021-01262-5"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306618.3314275"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445888"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aimag.v36i1.2564"},{"key":"e_1_3_2_1_8_1","volume-title":"Test driven development: A practical guide","author":"Astels Dave","unstructured":"Dave Astels. 2003. Test driven development: A practical guide. Prentice Hall Professional Technical Reference."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Solon Barocas Anhong Guo Ece Kamar Jacquelyn Krones Meredith\u00a0Ringel Morris Jennifer\u00a0Wortman Vaughan Duncan Wadsworth and Hanna Wallach. 2021. Designing Disaggregated Evaluations of AI Systems: Choices Considerations and Tradeoffs. arXiv preprint arXiv:2103.06076(2021).","DOI":"10.1145\/3461702.3462610"},{"key":"e_1_3_2_1_10_1","volume-title":"Fairness in machine learning. NIPS tutorial 1(2017)","author":"Barocas Solon","year":"2017","unstructured":"Solon Barocas, Moritz Hardt, and Arvind Narayanan. 2017. Fairness in machine learning. NIPS tutorial 1(2017), 2017."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Roland Barthes. 1977. Image-Music-Text. Macmillan.","DOI":"10.1007\/978-1-349-03518-2"},{"key":"e_1_3_2_1_12_1","unstructured":"Valerio Basile Federico Cabitza Andrea Campagner and Michael Fell. 2021. Toward a Perspectivist Turn in Ground Truthing for Predictive Computing. arXiv preprint arXiv:2109.04270(2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00041"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445922"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448250"},{"key":"e_1_3_2_1_16_1","volume-title":"Ways of seeing","author":"Berger John","unstructured":"John Berger. 2008. Ways of seeing. Penguin UK."},{"key":"e_1_3_2_1_17_1","volume-title":"Conference on Fairness, Accountability and Transparency. PMLR, 149\u2013159","author":"Binns Reuben","year":"2018","unstructured":"Reuben Binns. 2018. Fairness in machine learning: Lessons from political philosophy. In Conference on Fairness, Accountability and Transparency. PMLR, 149\u2013159."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Abeba Birhane Pratyusha Kalluri Dallas Card William Agnew Ravit Dotan and Michelle Bao. 2021. The values encoded in machine learning research. arXiv preprint arXiv:2106.15590(2021).","DOI":"10.1145\/3531146.3533083"},{"key":"e_1_3_2_1_19_1","unstructured":"Dami\u00e1n Blasi Antonios Anastasopoulos and Graham Neubig. 2021. Systematic Inequalities in Language Technology Performance across the World\u2019s Languages. arXiv preprint arXiv:2110.06733(2021)."},{"key":"e_1_3_2_1_20_1","unstructured":"Rishi Bommasani Drew\u00a0A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael\u00a0S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258(2021)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308560.3317593"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.385"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2017.8258038"},{"key":"e_1_3_2_1_24_1","volume-title":"Statistical modeling: The two cultures (with comments and a rejoinder by the author). Statistical science 16, 3","author":"Breiman Leo","year":"2001","unstructured":"Leo Breiman. 2001. Statistical modeling: The two cultures (with comments and a rejoinder by the author). Statistical science 16, 3 (2001), 199\u2013231."},{"key":"e_1_3_2_1_25_1","volume-title":"The Yale Literary Magazine","author":"Brewster Benjamin","year":"1881","unstructured":"Benjamin Brewster. 1881. The Yale Literary Magazine October 1881\u2013June 1882 (1881)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1061\/(ASCE)EI.1943-5541.0000205"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/1873781.1873796"},{"key":"e_1_3_2_1_28_1","volume-title":"Asian Conference on Machine Learning. PMLR, 622\u2013637","author":"Cai Ermao","year":"2017","unstructured":"Ermao Cai, Da-Cheng Juan, Dimitrios Stamoulis, and Diana Marculescu. 2017. NeuralPower: Predict and deploy energy-efficient convolutional neural networks. In Asian Conference on Machine Learning. PMLR, 622\u2013637."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2020.00034"},{"key":"e_1_3_2_1_30_1","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini, Florian Tramer, Eric Wallace, Matthew Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam Roberts, Tom Brown, Dawn Song, Ulfar Erlingsson, 2021. Extracting training data from large language models. In 30th USENIX Security Symposium (USENIX Security 21). 2633\u20132650."},{"key":"e_1_3_2_1_31_1","volume-title":"Overinterpretation reveals image classification model pathologies. Advances in Neural Information Processing Systems 34","author":"Carter Brandon","year":"2021","unstructured":"Brandon Carter, Siddhartha Jain, Jonas\u00a0W Mueller, and David Gifford. 2021. Overinterpretation reveals image classification model pathologies. Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1136\/bmjqs-2018-008370"},{"key":"e_1_3_2_1_33_1","volume-title":"Nothing about us without us","author":"Charlton I","unstructured":"James\u00a0I Charlton. 1998. Nothing about us without us. University of California Press."},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. PMLR, 1617\u20131629","author":"Chen Mayee","year":"2021","unstructured":"Mayee Chen, Karan Goel, Nimit\u00a0S Sohoni, Fait Poms, Kayvon Fatahalian, and Christopher R\u00e9. 2021. Mandoline: Model Evaluation under Distribution Shift. In International Conference on Machine Learning. PMLR, 1617\u20131629."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.373"},{"key":"e_1_3_2_1_36_1","unstructured":"Alex Chohlas-Wood Madison Coots Emma Brunskill and Sharad Goel. 2021. Learning to be Fair: A Consequentialist Approach to Equitable Decision-Making. arXiv preprint arXiv:2109.08792(2021)."},{"key":"e_1_3_2_1_37_1","volume-title":"Fair prediction with disparate impact: A study of bias in recidivism prediction instruments. Big data 5, 2","author":"Chouldechova Alexandra","year":"2017","unstructured":"Alexandra Chouldechova. 2017. Fair prediction with disparate impact: A study of bias in recidivism prediction instruments. Big data 5, 2 (2017), 153\u2013163."},{"key":"e_1_3_2_1_38_1","unstructured":"Sam Corbett-Davies and Sharad Goel. 2018. The measure and mismeasure of fairness: A critical review of fair machine learning. arXiv preprint arXiv:1808.00023(2018)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098095"},{"key":"e_1_3_2_1_40_1","volume-title":"Anatomy of an AI System. (Accessed","author":"Crawford Kate","year":"2022","unstructured":"Kate Crawford and Vladan Joler. 2018. Anatomy of an AI System. (Accessed January, 2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Excavating AI: The politics of images in machine learning training sets. AI & SOCIETY","author":"Crawford Kate","year":"2021","unstructured":"Kate Crawford and Trevor Paglen. 2021. Excavating AI: The politics of images in machine learning training sets. AI & SOCIETY (2021), 1\u201312."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-28005-5_57"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1057\/s41599-021-00750-9"},{"key":"e_1_3_2_1_44_1","unstructured":"Alexander D\u2019Amour Katherine Heller Dan Moldovan Ben Adlam Babak Alipanahi Alex Beutel Christina Chen Jonathan Deaton Jacob Eisenstein Matthew\u00a0D Hoffman 2020. Underspecification presents challenges for credibility in modern machine learning. arXiv preprint arXiv:2011.03395(2020)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00449"},{"key":"e_1_3_2_1_46_1","unstructured":"Harm De\u00a0Vries Dzmitry Bahdanau and Christopher Manning. 2020. Towards ecologically valid research on language user interfaces. arXiv preprint arXiv:2007.14435(2020)."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC\u201916)","author":"Derczynski Leon","year":"2016","unstructured":"Leon Derczynski. 2016. Complementarity, F-score, and NLP Evaluation. In Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC\u201916). 261\u2013266."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3359228"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1177\/0093854818811379"},{"key":"e_1_3_2_1_50_1","unstructured":"Ulle Endriss. 2018. Lecture notes on fair division. arXiv preprint arXiv:1806.04234(2018)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.393"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Allyson Ettinger Sudha Rao Hal Daum\u00e9\u00a0III and Emily\u00a0M Bender. 2017. Towards linguistically generalizable NLP systems: A workshop and shared task. arXiv preprint arXiv:1711.01505(2017).","DOI":"10.18653\/v1\/W17-5401"},{"key":"e_1_3_2_1_53_1","unstructured":"Utku Evci Vincent Dumoulin Hugo Larochelle and Michael\u00a0Curtis Mozer. 2021. Head2Toe: Utilizing Intermediate Representations for Better OOD Generalization. (2021)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Abolfazl Farahani Sahar Voghoei Khaled Rasheed and Hamid\u00a0R Arabnia. 2020. A brief review of domain adaptation. arXiv preprint arXiv:2010.03978(2020).","DOI":"10.1007\/978-3-030-71704-9_65"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.204"},{"key":"e_1_3_2_1_56_1","volume-title":"Studying those who study us: An anthropologist in the world of Artificial Intelligence","author":"Forsythe Diana","unstructured":"Diana Forsythe. 2001. Studying those who study us: An anthropologist in the world of Artificial Intelligence. Stanford University Press, Chapter Artificial intelligence invents itself: Collective identity and boundary maintenance in an emergent scientific discipline."},{"key":"e_1_3_2_1_57_1","volume-title":"Studying those who study us: An anthropologist in the world of Artificial Intelligence","author":"Forsythe Diana","unstructured":"Diana Forsythe. 2001. Studying those who study us: An anthropologist in the world of Artificial Intelligence. Stanford University Press, Chapter The Construction of Knowledge in Artificial Intelligence."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3433949"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00119"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2019.07.007"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306618.3317950"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458723"},{"key":"e_1_3_2_1_63_1","volume-title":"The Interpretation of Cultures","author":"Geertz Clifford","unstructured":"Clifford Geertz. 1973. The Interpretation of Cultures. Basic Books."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1080\/17530350.2019.1684337"},{"key":"e_1_3_2_1_65_1","volume-title":"A structured experiment of test-driven development. Information and software Technology 46, 5","author":"George Boby","year":"2004","unstructured":"Boby George and Laurie Williams. 2004. A structured experiment of test-driven development. Information and software Technology 46, 5 (2004), 337\u2013342."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.180"},{"key":"e_1_3_2_1_67_1","volume-title":"Ghost work: How to stop Silicon Valley from building a new global underclass","author":"Gray L","unstructured":"Mary\u00a0L Gray and Siddharth Suri. 2019. Ghost work: How to stop Silicon Valley from building a new global underclass. Eamon Dolan Books."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Ben Green. 2020. Data science as political action: grounding data science in a politics of justice. Available at SSRN 3658431(2020).","DOI":"10.2139\/ssrn.3658431"},{"key":"e_1_3_2_1_69_1","unstructured":"Huong Ha Sunil Gupta Santu Rana and Svetha Venkatesh. 2021. ALT-MAS: A Data-Efficient Framework for Active Testing of Machine Learning Algorithms. arXiv preprint arXiv:2104.04999(2021)."},{"key":"e_1_3_2_1_70_1","volume-title":"Is statistics too difficult?Canadian Journal of Statistics 26, 3","author":"Hampel Frank","year":"1998","unstructured":"Frank Hampel and Eth Zurich. 1998. Is statistics too difficult?Canadian Journal of Statistics 26, 3 (1998), 497\u2013513."},{"key":"e_1_3_2_1_71_1","volume-title":"Equality of opportunity in supervised learning. Advances in neural information processing systems 29","author":"Hardt Moritz","year":"2016","unstructured":"Moritz Hardt, Eric Price, and Nati Srebro. 2016. Equality of opportunity in supervised learning. Advances in neural information processing systems 29 (2016), 3315\u20133323."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/336512.336532"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_74_1","unstructured":"Courtney Heldreth Michal Lahav Zion Mengesha Juliana Sublewski and Elyse Tuennerman. 2021. \u201cI don\u2019t think these devices are very culturally sensitive.\u201d\u2014The impact of errors on African Americans in Automated Speech Recognition. Frontiers in Artificial Intelligence 26 (2021)."},{"key":"e_1_3_2_1_75_1","first-page":"1","article-title":"Towards the systematic reporting of the energy and carbon footprints of machine learning","volume":"21","author":"Henderson Peter","year":"2020","unstructured":"Peter Henderson, Jieru Hu, Joshua Romoff, Emma Brunskill, Dan Jurafsky, and Joelle Pineau. 2020. Towards the systematic reporting of the energy and carbon footprints of machine learning. Journal of Machine Learning Research 21, 248 (2020), 1\u201343.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_76_1","volume-title":"International Conference on Learning Representations.","author":"Hendrycks Dan","year":"2018","unstructured":"Dan Hendrycks and Thomas Dietterich. 2018. Benchmarking Neural Network Robustness to Common Corruptions and Perturbations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_27"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300830"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02430364"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-2096"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.7"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287600"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445918"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.trustnlp-1.8"},{"key":"e_1_3_2_1_85_1","volume-title":"Ethically Aligned Design: A Vision for Prioritizing Human Well-being with Autonomous and Intelligent Systems","author":"Global IEEE.","unstructured":"IEEE. 2019. The IEEE Global Initiative on Ethics of Autonomous and Intelligent Systems. \u201cClassical Ethics in A\/IS\u201d. In Ethically Aligned Design: A Vision for Prioritizing Human Well-being with Autonomous and Intelligent Systems, First Edition. 36\u201367."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3375671"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445901"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01256"},{"key":"e_1_3_2_1_89_1","volume-title":"AAAI workshop on evaluation methods for machine learning. 6\u201311","author":"Japkowicz Nathalie","year":"2006","unstructured":"Nathalie Japkowicz. 2006. Why question machine learning evaluation methods. In AAAI workshop on evaluation methods for machine learning. 6\u201311."},{"key":"e_1_3_2_1_90_1","unstructured":"Tony Jappy. 2013. Introduction to Peircean visual semiotics. A&C Black."},{"key":"e_1_3_2_1_91_1","first-page":"18600","article-title":"Can i trust my fairness metric? assessing fairness with unlabeled data and bayesian inference","volume":"33","author":"Ji Disi","year":"2020","unstructured":"Disi Ji, Padhraic Smyth, and Mark Steyvers. 2020. Can i trust my fairness metric? assessing fairness with unlabeled data and bayesian inference. Advances in Neural Information Processing Systems 33 (2020), 18600\u201318612.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_92_1","volume-title":"Evaluating natural language processing systems: An analysis and review. Vol.\u00a01083","author":"Jones Karen\u00a0Sparck","unstructured":"Karen\u00a0Sparck Jones and Julia\u00a0R Galliers. 1995. Evaluating natural language processing systems: An analysis and review. Vol.\u00a01083. Springer Science & Business Media."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287578"},{"key":"e_1_3_2_1_94_1","unstructured":"Bernard Koch Emily Denton Alex Hanna and Jacob\u00a0G Foster. 2021. Reduced Reused and Recycled: The Life of a Dataset in Machine Learning Research. NeurIPS Dataset & Benchmark track(2021)."},{"key":"e_1_3_2_1_95_1","volume-title":"WILDS: A Benchmark of in-the-Wild Distribution Shifts. CoRR abs\/2012.07421(2020). https:\/\/arxiv.org\/abs\/2012.07421","author":"Koh Pang\u00a0Wei","year":"2020","unstructured":"Pang\u00a0Wei Koh, Shiori Sagawa, Henrik Marklund, Sang\u00a0Michael Xie, Marvin Zhang, Akshay Balsubramani, Weihua Hu, Michihiro Yasunaga, Richard\u00a0Lanas Phillips, Sara Beery, Jure Leskovec, Anshul Kundaje, Emma Pierson, Sergey Levine, Chelsea Finn, and Percy Liang. 2020. WILDS: A Benchmark of in-the-Wild Distribution Shifts. CoRR abs\/2012.07421(2020). https:\/\/arxiv.org\/abs\/2012.07421"},{"key":"e_1_3_2_1_96_1","volume-title":"International Conference on Machine Learning. PMLR, 5753\u20135763","author":"Kossen Jannik","year":"2021","unstructured":"Jannik Kossen, Sebastian Farquhar, Yarin Gal, and Tom Rainforth. 2021. Active testing: Sample-efficient model evaluation. In International Conference on Machine Learning. PMLR, 5753\u20135763."},{"key":"e_1_3_2_1_97_1","volume-title":"Indigenous data sovereignty: Toward an agenda","author":"Kukutai Tahu","unstructured":"Tahu Kukutai and John Taylor. 2016. Indigenous data sovereignty: Toward an agenda. ANU press."},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-020-05872-w"},{"key":"e_1_3_2_1_99_1","unstructured":"Alexandre Lacoste Thomas Boquet Negar Rostamzadeh Boris Oreshkin Wonchang Chung and David Krueger. 2017. Deep prior. arXiv preprint arXiv:1712.05016(2017)."},{"key":"e_1_3_2_1_100_1","unstructured":"Alexandre Lacoste Boris Oreshkin Wonchang Chung Thomas Boquet Negar Rostamzadeh and David Krueger. 2018. Uncertainty in multitask transfer learning. arXiv preprint arXiv:1806.07528(2018)."},{"key":"e_1_3_2_1_101_1","volume-title":"Metaphors we live by","author":"Lakoff George","unstructured":"George Lakoff and Mark Johnson. 2008. Metaphors we live by. University of Chicago press."},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1214\/19-AOS1828"},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.502"},{"key":"e_1_3_2_1_104_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2).","author":"Liao Thomas","year":"2021","unstructured":"Thomas Liao, Rohan Taori, Inioluwa\u00a0Deborah Raji, and Ludwig Schmidt. 2021. Are We Learning Yet? A Meta Review of Evaluation Failures Across Machine Learning. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)."},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","DOI":"10.1002\/mar.20177"},{"key":"e_1_3_2_1_106_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_2_1_107_1","volume-title":"International Conference on Machine Learning. PMLR, 3150\u20133158","author":"Liu T","year":"2018","unstructured":"Lydia\u00a0T Liu, Sarah Dean, Esther Rolf, Max Simchowitz, and Moritz Hardt. 2018. Delayed impact of fair machine learning. In International Conference on Machine Learning. PMLR, 3150\u20133158."},{"key":"e_1_3_2_1_108_1","unstructured":"Chi-kiu Lo and Dekai Wu. 2010. Evaluating Machine Translation Utility via Semantic Role Labels.. In LREC. Citeseer."},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512899"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.clsr.2018.05.017"},{"key":"e_1_3_2_1_111_1","unstructured":"Marrkula Center. 2019. Approaches to Ethical Decision-making. https:\/\/www.scu.edu\/ethics\/ethics-resources\/ethical-decision-making\/"},{"key":"e_1_3_2_1_112_1","volume-title":"Andrew Smart, and William\u00a0S. Isaac","author":"Martin Donald","year":"2020","unstructured":"Donald Martin, Jr., Vinodkumar Prabhakaran, Jill Kuhlberg, Andrew Smart, and William\u00a0S. Isaac. 2020. Extending the Machine Learning Abstraction Boundary: A Complex Systems Approach to Incorporate Societal Context. arxiv:2006.09663\u00a0[cs.CY]"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"publisher","DOI":"10.1109\/CHASE.2013.6614749"},{"key":"e_1_3_2_1_114_1","volume-title":"Could Big Data be the end of theory in science? A few remarks on the epistemology of data-driven science. EMBO reports 16, 10","author":"Mazzocchi Fulvio","year":"2015","unstructured":"Fulvio Mazzocchi. 2015. Could Big Data be the end of theory in science? A few remarks on the epistemology of data-driven science. EMBO reports 16, 10 (2015), 1250\u20131255."},{"key":"e_1_3_2_1_115_1","doi-asserted-by":"publisher","DOI":"10.1017\/S0020589319000046"},{"key":"e_1_3_2_1_116_1","doi-asserted-by":"crossref","unstructured":"Douglas\u00a0S McNair. 2018. Preventing disparities: Bayesian and frequentist methods for assessing fairness in machine learning decision-support models. New Insights into Bayesian Inference(2018) 71.","DOI":"10.5772\/intechopen.73176"},{"key":"e_1_3_2_1_117_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445880"},{"key":"e_1_3_2_1_118_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287596"},{"key":"e_1_3_2_1_119_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.117"},{"key":"e_1_3_2_1_120_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00473"},{"key":"e_1_3_2_1_121_1","volume-title":"The art of software testing","author":"Myers J","unstructured":"Glenford\u00a0J Myers, Corey Sandler, and Tom Badgett. 2011. The art of software testing. John Wiley & Sons."},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.nlpmc-1.7"},{"key":"e_1_3_2_1_123_1","doi-asserted-by":"crossref","unstructured":"Peter Norvig. 2017. On Chomsky and the two cultures of statistical learning. In Berechenbarkeit der Welt?Springer 61\u201383.","DOI":"10.1007\/978-3-658-12153-2_3"},{"key":"e_1_3_2_1_124_1","doi-asserted-by":"publisher","DOI":"10.3389\/fdata.2019.00013"},{"key":"e_1_3_2_1_125_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00883"},{"key":"e_1_3_2_1_126_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_2_1_127_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00773"},{"key":"e_1_3_2_1_128_1","first-page":"37","article-title":"Evaluation: From Precision, Recall and F-Factor to ROC, Informedness, Markedness & Correlation","volume":"2","author":"Martin\u00a0Ward Powers David","year":"2011","unstructured":"David Martin\u00a0Ward Powers. 2011. Evaluation: From Precision, Recall and F-Factor to ROC, Informedness, Markedness & Correlation. Journal of Machine Learning Technologies 2, 1 (2011), 37\u201363.","journal-title":"Journal of Machine Learning Technologies"},{"key":"e_1_3_2_1_129_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIST.2012.6221710"},{"key":"e_1_3_2_1_130_1","volume-title":"Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics. 345\u2013355","author":"Martin\u00a0Ward Powers David","year":"2012","unstructured":"David Martin\u00a0Ward Powers. 2012. The problem with kappa. In Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics. 345\u2013355."},{"key":"e_1_3_2_1_131_1","volume-title":"Flaws, Fallacies and Fixes. Technical report","author":"Martin\u00a0Ward Powers David","year":"2014","unstructured":"David Martin\u00a0Ward Powers. 2014. What the F-measure doesn\u2019t measure: Features, Flaws, Fallacies and Fixes. Technical report, Beijing University of Technology, China & Flinders University, Australia, Tech. Rep.(2014)."},{"key":"e_1_3_2_1_132_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.law-1.14"},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1578"},{"key":"e_1_3_2_1_134_1","volume-title":"Proceedings of AAAI-97 Workshop on AI Approaches to Fraud Detection & Risk Management. 57\u201363","author":"Provost Foster","year":"1997","unstructured":"Foster Provost and Tom Fawcett. 1997. Analysis and visualization of classifier performance with nonuniform class and cost distributions. In Proceedings of AAAI-97 Workshop on AI Approaches to Fraud Detection & Risk Management. 57\u201363."},{"key":"e_1_3_2_1_135_1","volume-title":"The generative lexicon","author":"Pustejovsky James","unstructured":"James Pustejovsky. 1998. The generative lexicon. MIT press."},{"key":"e_1_3_2_1_136_1","volume-title":"AI and the Everything in the Whole Wide World Benchmark. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2).","author":"Raji Inioluwa\u00a0Deborah","year":"2021","unstructured":"Inioluwa\u00a0Deborah Raji, Emily Denton, Emily\u00a0M Bender, Alex Hanna, and Amandalynne Paullada. 2021. AI and the Everything in the Whole Wide World Benchmark. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)."},{"key":"e_1_3_2_1_137_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372873"},{"key":"e_1_3_2_1_138_1","volume-title":"Artificial intelligence & human rights: Opportunities & risks","author":"Raso A","year":"2018","unstructured":"Filippo\u00a0A Raso, Hannah Hilligoss, Vivek Krishnamurthy, Christopher Bavitz, and Levin Kim. 2018. Artificial intelligence & human rights: Opportunities & risks. Berkman Klein Center Research Publication2018-6 (2018)."},{"key":"e_1_3_2_1_139_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_2_1_140_1","doi-asserted-by":"crossref","unstructured":"Marco\u00a0Tulio Ribeiro Tongshuang Wu Carlos Guestrin and Sameer Singh. 2020. Beyond accuracy: Behavioral testing of NLP models with CheckList. arXiv preprint arXiv:2005.04118(2020).","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"e_1_3_2_1_141_1","doi-asserted-by":"publisher","DOI":"10.1109\/ETHICS53270.2021.9632769"},{"key":"e_1_3_2_1_142_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_143_1","volume-title":"Fashion-gen: The generative fashion dataset and challenge. arXiv preprint arXiv:1806.08317(2018).","author":"Rostamzadeh Negar","year":"2018","unstructured":"Negar Rostamzadeh, Seyedarian Hosseini, Thomas Boquet, Wojciech Stokowiec, Ying Zhang, Christian Jauvin, and Chris Pal. 2018. Fashion-gen: The generative fashion dataset and challenge. arXiv preprint arXiv:1806.08317(2018)."},{"key":"e_1_3_2_1_144_1","volume-title":"Thinking Beyond Distributions in Testing Machine Learned Models. In NeurIPS 2021 Workshop on Distribution Shifts: Connecting Methods and Applications.","author":"Rostamzadeh Negar","year":"2021","unstructured":"Negar Rostamzadeh, Ben Hutchinson, Christina Greer, and Vinodkumar Prabhakaran. 2021. Thinking Beyond Distributions in Testing Machine Learned Models. In NeurIPS 2021 Workshop on Distribution Shifts: Connecting Methods and Applications."},{"key":"e_1_3_2_1_145_1","volume-title":"Simulated Adversarial Testing of Face Recognition Models. CVPR","author":"Ruiz Nataniel","year":"2022","unstructured":"Nataniel Ruiz, Adam Kortylewski, Weichao Qiu, Cihang Xie, Sarah\u00a0Adel Bargal, Alan Yuille, and Stan Sclaroff. 2022. Simulated Adversarial Testing of Face Recognition Models. CVPR (2022)."},{"key":"e_1_3_2_1_146_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445896"},{"key":"e_1_3_2_1_147_1","doi-asserted-by":"publisher","DOI":"10.1145\/3298735"},{"key":"e_1_3_2_1_148_1","volume-title":"Data Cascades in High-Stakes AI. In proceedings of the 2021 CHI Conference on Human Factors in Computing Systems. 1\u201315","author":"Sambasivan Nithya","year":"2021","unstructured":"Nithya Sambasivan, Shivani Kapania, Hannah Highfill, Diana Akrong, Praveen Paritosh, and Lora\u00a0M Aroyo. 2021. \u201cEveryone wants to do the model work, not the data work\u201d: Data Cascades in High-Stakes AI. In proceedings of the 2021 CHI Conference on Human Factors in Computing Systems. 1\u201315."},{"key":"e_1_3_2_1_149_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387940.3392238"},{"key":"e_1_3_2_1_150_1","doi-asserted-by":"publisher","DOI":"10.3115\/1119176.1119195"},{"key":"e_1_3_2_1_151_1","doi-asserted-by":"publisher","DOI":"10.1145\/3476058"},{"key":"e_1_3_2_1_152_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.85"},{"key":"e_1_3_2_1_153_1","doi-asserted-by":"publisher","DOI":"10.1145\/3381831"},{"key":"e_1_3_2_1_154_1","volume-title":"Proceedings of ICLR","author":"Sculley David","year":"2018","unstructured":"David Sculley, Jasper Snoek, Alex Wiltschko, and Ali Rahimi. 2018. Winner\u2019s curse? On pace, progress, and empirical rigor. In Proceedings of ICLR 2018."},{"key":"e_1_3_2_1_155_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287598"},{"key":"e_1_3_2_1_156_1","volume-title":"Basic rights: Subsistence, affluence, and US foreign policy","author":"Shue Henry","unstructured":"Henry Shue. 2020. Basic rights: Subsistence, affluence, and US foreign policy. Princeton University Press."},{"key":"e_1_3_2_1_157_1","doi-asserted-by":"publisher","DOI":"10.1145\/3180492"},{"key":"e_1_3_2_1_158_1","volume-title":"The Stanford Encyclopedia of Philosophy","author":"Sinnott-Armstrong Walter","year":"2021","unstructured":"Walter Sinnott-Armstrong. 2021. Consequentialism. The Stanford Encyclopedia of Philosophy Winter 2021 Edition (2021). https:\/\/plato.stanford.edu\/archives\/win2021\/entries\/consequentialism\/"},{"key":"e_1_3_2_1_159_1","volume-title":"Institutional ecology, \u2018translations","author":"Star Susan\u00a0Leigh","year":"1907","unstructured":"Susan\u00a0Leigh Star and James\u00a0R Griesemer. 1989. Institutional ecology, \u2018translations\u2019 and boundary objects: Amateurs and professionals in Berkeley\u2019s Museum of Vertebrate Zoology, 1907-39. Social studies of science 19, 3 (1989), 387\u2013420."},{"key":"e_1_3_2_1_160_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1355"},{"key":"e_1_3_2_1_161_1","volume-title":"Direct Importance Estimation with Model Selection and Its Application to Covariate Shift Adaptation. Advances in Neural Information Processing Systems 20","author":"Sugiyama Masashi","year":"2007","unstructured":"Masashi Sugiyama, Shinichi Nakajima, Hisashi Kashima, Paul Buenau, and Motoaki Kawanabe. 2007. Direct Importance Estimation with Model Selection and Its Application to Covariate Shift Adaptation. Advances in Neural Information Processing Systems 20 (2007)."},{"key":"e_1_3_2_1_162_1","volume-title":"Proceedings of the Ethics of Data Science Conference.","author":"Thomas RL","year":"2020","unstructured":"RL Thomas and D Uminsky. 2020. Reliance on metrics is a fundamental challenge for AI. In Proceedings of the Ethics of Data Science Conference."},{"key":"e_1_3_2_1_163_1","volume-title":"The future of data analysis. The annals of mathematical statistics 33, 1","author":"Tukey W","year":"1962","unstructured":"John\u00a0W Tukey. 1962. The future of data analysis. The annals of mathematical statistics 33, 1 (1962), 1\u201367."},{"key":"e_1_3_2_1_164_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.120"},{"key":"e_1_3_2_1_165_1","volume-title":"55th Annual Meeting of the Association for Computational Linguistics, ACL","author":"Ustalov Dmitry","year":"2017","unstructured":"Dmitry Ustalov, Alexander Panchenko, and Chris Biemann. 2017. Watset: Automatic induction of synsets from a graph of synonyms. In 55th Annual Meeting of the Association for Computational Linguistics, ACL 2017. Association for Computational Linguistics, 1579\u20131590."},{"key":"e_1_3_2_1_166_1","volume-title":"Technology and the virtues: A philosophical guide to a future worth wanting","author":"Vallor Shannon","unstructured":"Shannon Vallor. 2016. Technology and the virtues: A philosophical guide to a future worth wanting. Oxford University Press."},{"key":"e_1_3_2_1_167_1","doi-asserted-by":"crossref","unstructured":"Cornelis\u00a0Joost Van\u00a0Rijsbergen. 1974. Foundation of evaluation. Journal of documentation(1974).","DOI":"10.1108\/eb026584"},{"key":"e_1_3_2_1_168_1","doi-asserted-by":"publisher","DOI":"10.1109\/REW.2019.00050"},{"key":"e_1_3_2_1_169_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132698"},{"key":"e_1_3_2_1_170_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_2_1_171_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/628"},{"key":"e_1_3_2_1_172_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-3801"},{"key":"e_1_3_2_1_173_1","volume-title":"Discriminating systems. AI Now","author":"West Sarah\u00a0Myers","year":"2019","unstructured":"Sarah\u00a0Myers West, Meredith Whittaker, and Kate Crawford. 2019. Discriminating systems. AI Now (2019)."},{"key":"e_1_3_2_1_174_1","volume-title":"Contrastive Training for Improved Out-of-Distribution Detection. arXiv e-prints","author":"Winkens Jim","year":"2020","unstructured":"Jim Winkens, Rudy Bunel, Abhijit Guha\u00a0Roy, Robert Stanforth, Vivek Natarajan, Joseph\u00a0R Ledsam, Patricia MacWilliams, Pushmeet Kohli, Alan Karthikesalingam, Simon Kohl, 2020. Contrastive Training for Improved Out-of-Distribution Detection. arXiv e-prints (2020), arXiv\u20132007."},{"key":"e_1_3_2_1_175_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_1_176_1","doi-asserted-by":"publisher","DOI":"10.1109\/CSF.2018.00027"},{"key":"e_1_3_2_1_177_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-demo.43"},{"key":"e_1_3_2_1_178_1","volume-title":"Machine learning testing: Survey, landscapes and horizons","author":"Zhang M","year":"2020","unstructured":"Jie\u00a0M Zhang, Mark Harman, Lei Ma, and Yang Liu. 2020. Machine learning testing: Survey, landscapes and horizons. IEEE Transactions on Software Engineering(2020)."},{"key":"e_1_3_2_1_179_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3374217","article-title":"Adversarial attacks on deep-learning models in natural language processing: A survey","volume":"11","author":"Zhang Wei\u00a0Emma","year":"2020","unstructured":"Wei\u00a0Emma Zhang, Quan\u00a0Z Sheng, Ahoud Alhazmi, and Chenliang Li. 2020. Adversarial attacks on deep-learning models in natural language processing: A survey. ACM Transactions on Intelligent Systems and Technology (TIST) 11, 3(2020), 1\u201341.","journal-title":"ACM Transactions on Intelligent Systems and Technology (TIST)"},{"key":"e_1_3_2_1_180_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411495.3421352"},{"key":"e_1_3_2_1_181_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2003"}],"event":{"name":"FAccT '22: 2022 ACM Conference on Fairness, Accountability, and Transparency","location":"Seoul Republic of Korea","acronym":"FAccT '22","sponsor":["ACM Association for Computing Machinery"]},"container-title":["2022 ACM Conference on Fairness Accountability and Transparency"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3531146.3533233","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3531146.3533233","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:31:30Z","timestamp":1750188690000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3531146.3533233"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,20]]},"references-count":181,"alternative-id":["10.1145\/3531146.3533233","10.1145\/3531146"],"URL":"https:\/\/doi.org\/10.1145\/3531146.3533233","relation":{},"subject":[],"published":{"date-parts":[[2022,6,20]]},"assertion":[{"value":"2022-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}