{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T00:00:14Z","timestamp":1775520014900,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,4,19]],"date-time":"2023-04-19T00:00:00Z","timestamp":1681862400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,19]]},"DOI":"10.1145\/3544548.3581482","type":"proceedings-article","created":{"date-parts":[[2023,4,20]],"date-time":"2023-04-20T04:28:44Z","timestamp":1681964924000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Kaleidoscope: Semantically-grounded, context-specific ML model evaluation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9769-4947","authenticated-orcid":false,"given":"Harini","family":"Suresh","sequence":"first","affiliation":[{"name":"CSAIL, MIT, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1994-053X","authenticated-orcid":false,"given":"Divya","family":"Shanmugam","sequence":"additional","affiliation":[{"name":"Clinical and Applied Machine Learning Group, MIT, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7483-3596","authenticated-orcid":false,"given":"Tiffany","family":"Chen","sequence":"additional","affiliation":[{"name":"CSAIL, MIT, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1140-8576","authenticated-orcid":false,"given":"Annie G","family":"Bryan","sequence":"additional","affiliation":[{"name":"CSAIL, MIT, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7984-3366","authenticated-orcid":false,"given":"Alexander","family":"D'Amour","sequence":"additional","affiliation":[{"name":"Google Research, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0992-0906","authenticated-orcid":false,"given":"John","family":"Guttag","sequence":"additional","affiliation":[{"name":"CSAIL, MIT, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5564-635X","authenticated-orcid":false,"given":"Arvind","family":"Satyanarayan","sequence":"additional","affiliation":[{"name":"CSAIL, MIT, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,4,19]]},"reference":[{"key":"e_1_3_3_3_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308560.3317593"},{"key":"e_1_3_3_3_2_1","volume-title":"Using thematic analysis in psychology. Qualitative research in psychology 3, 2","author":"Braun Virginia","year":"2006","unstructured":"Virginia Braun and Victoria Clarke. 2006. Using thematic analysis in psychology. Qualitative research in psychology 3, 2 (2006), 77\u2013101."},{"key":"e_1_3_3_3_3_1","volume-title":"Conference on fairness, accountability and transparency. PMLR, 77\u201391","author":"Buolamwini Joy","year":"2018","unstructured":"Joy Buolamwini and Timnit Gebru. 2018. Gender shades: Intersectional accuracy disparities in commercial gender classification. In Conference on fairness, accountability and transparency. PMLR, 77\u201391."},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3359206"},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"crossref","unstructured":"Jose Camacho-Collados Kiamehr Rezaee Talayeh Riahi Asahi Ushio Daniel Loureiro Dimosthenis Antypas Joanne Boisson Luis Espinosa-Anke Fangyu Liu Eugenio Mart\u00ednez-C\u00e1mara 2022. TweetNLP: Cutting-Edge Natural Language Processing for Social Media. arXiv preprint arXiv:2206.14774(2022).","DOI":"10.18653\/v1\/2022.emnlp-demos.5"},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2029"},{"key":"e_1_3_3_3_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3274301"},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533108"},{"key":"e_1_3_3_3_9_1","unstructured":"Alexander D\u2019Amour Katherine Heller Dan Moldovan Ben Adlam Babak Alipanahi Alex Beutel Christina Chen Jonathan Deaton Jacob Eisenstein Matthew\u00a0D Hoffman 2020. Underspecification presents challenges for credibility in modern machine learning. arXiv preprint arXiv:2011.03395(2020)."},{"key":"e_1_3_3_3_10_1","volume-title":"Hilary Nicole, and Morgan\u00a0Klaus Scheuerman.","author":"Denton Emily","year":"2020","unstructured":"Emily Denton, Alex Hanna, Razvan Amironesei, Andrew Smart, Hilary Nicole, and Morgan\u00a0Klaus Scheuerman. 2020. Bringing the people back in: Contesting benchmark machine learning datasets. arXiv preprint arXiv:2007.07399(2020)."},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3534647"},{"key":"e_1_3_3_3_12_1","volume-title":"Data feminism","author":"D\u2019ignazio Catherine","unstructured":"Catherine D\u2019ignazio and Lauren\u00a0F Klein. 2020. Data feminism. MIT press."},{"key":"e_1_3_3_3_13_1","unstructured":"Finale Doshi-Velez and Been Kim. 2017. Towards a rigorous science of interpretable machine learning. arXiv preprint arXiv:1702.08608(2017)."},{"key":"e_1_3_3_3_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3373157"},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v12i1.15033"},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1177\/1473871611416549"},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"crossref","unstructured":"Karan Goel Nazneen Rajani Jesse Vig Samson Tan Jason Wu Stephan Zheng Caiming Xiong Mohit Bansal and Christopher R\u00e9. 2021. Robustness gym: Unifying the nlp evaluation landscape. arXiv preprint arXiv:2101.04840(2021).","DOI":"10.18653\/v1\/2021.naacl-demos.6"},{"key":"e_1_3_3_3_19_1","unstructured":"Thomas\u00a0RG Green. 1989. Cognitive dimensions of notations. People and computers V(1989) 443\u2013460."},{"key":"e_1_3_3_3_20_1","volume-title":"International conference on machine learning. PMLR, 1321\u20131330","author":"Guo Chuan","year":"2017","unstructured":"Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian\u00a0Q Weinberger. 2017. On calibration of modern neural networks. In International conference on machine learning. PMLR, 1321\u20131330."},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1037\/0278-7393.18.4.691"},{"key":"e_1_3_3_3_22_1","unstructured":"Laura Hanu and Unitary team. 2020. Detoxify. Github. https:\/\/github.com\/unitaryai\/detoxify."},{"key":"e_1_3_3_3_23_1","volume-title":"Feminist theory reader","author":"Haraway Donna","unstructured":"Donna Haraway. 2020. Situated knowledges: The science question in feminism and the privilege of partial perspective. In Feminist theory reader. Routledge, 303\u2013310."},{"key":"e_1_3_3_3_24_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HJz6tiCqYm","author":"Hendrycks Dan","year":"2019","unstructured":"Dan Hendrycks and Thomas Dietterich. 2019. Benchmarking Neural Network Robustness to Common Corruptions and Perturbations. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HJz6tiCqYm"},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445923"},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00159"},{"key":"e_1_3_3_3_27_1","volume-title":"Ethics of Data and Analytics","author":"Kluttz N","unstructured":"Daniel\u00a0N Kluttz, Nitin Kohli, and Deirdre\u00a0K Mulligan. 2022. Shaping our tools: Contestability as a means to promote responsible algorithmic decision making in the professions. In Ethics of Data and Analytics. Auerbach Publications, 420\u2013428."},{"key":"e_1_3_3_3_28_1","volume-title":"International Conference on Machine Learning. PMLR, 5637\u20135664","author":"Koh Pang\u00a0Wei","year":"2021","unstructured":"Pang\u00a0Wei Koh, Shiori Sagawa, Henrik Marklund, Sang\u00a0Michael Xie, Marvin Zhang, Akshay Balsubramani, Weihua Hu, Michihiro Yasunaga, Richard\u00a0Lanas Phillips, Irena Gao, 2021. Wilds: A benchmark of in-the-wild distribution shifts. In International Conference on Machine Learning. PMLR, 5637\u20135664."},{"key":"e_1_3_3_3_29_1","volume-title":"Text visualization techniques: Taxonomy, visual survey, and community insights. In 2015 IEEE Pacific visualization symposium (pacificVis)","author":"Kucher Kostiantyn","unstructured":"Kostiantyn Kucher and Andreas Kerren. 2015. Text visualization techniques: Taxonomy, visual survey, and community insights. In 2015 IEEE Pacific visualization symposium (pacificVis). IEEE, 117\u2013121."},{"key":"e_1_3_3_3_30_1","volume-title":"Seventeenth Symposium on Usable Privacy and Security (SOUPS","author":"Kumar Deepak","year":"2021","unstructured":"Deepak Kumar, Patrick\u00a0Gage Kelley, Sunny Consolvo, Joshua Mason, Elie Bursztein, Zakir Durumeric, Kurt Thomas, and Michael Bailey. 2021. Designing toxic content classification for a diversity of perspectives. In Seventeenth Symposium on Usable Privacy and Security (SOUPS 2021). 299\u2013318."},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"crossref","unstructured":"Alyssa Lees Vinh\u00a0Q Tran Yi Tay Jeffrey Sorensen Jai Gupta Donald Metzler and Lucy Vasserman. 2022. A new generation of perspective api: Efficient multilingual character-level transformers. arXiv preprint arXiv:2202.11176(2022).","DOI":"10.1145\/3534678.3539147"},{"key":"e_1_3_3_3_32_1","unstructured":"Yunhui Long Vincent Bindschaedler and Carl\u00a0A Gunter. 2017. Towards measuring membership privacy. arXiv preprint arXiv:1712.09136(2017)."},{"key":"e_1_3_3_3_33_1","volume-title":"Andrew Smart, and William\u00a0S Isaac","author":"Martin\u00a0Jr Donald","year":"2020","unstructured":"Donald Martin\u00a0Jr, Vinodkumar Prabhakaran, Jill Kuhlberg, Andrew Smart, and William\u00a0S Isaac. 2020. Extending the machine learning abstraction boundary: A Complex systems approach to incorporate societal context. arXiv preprint arXiv:2006.09663(2020)."},{"key":"e_1_3_3_3_34_1","volume-title":"The magical number seven, plus or minus two: Some limits on our capacity for processing information.Psychological review 63, 2","author":"Miller A","year":"1956","unstructured":"George\u00a0A Miller. 1956. The magical number seven, plus or minus two: Some limits on our capacity for processing information.Psychological review 63, 2 (1956), 81."},{"key":"e_1_3_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287596"},{"key":"e_1_3_3_3_36_1","volume-title":"International Conference on Machine Learning. PMLR, 4901\u20134911","author":"Odena Augustus","year":"2019","unstructured":"Augustus Odena, Catherine Olsson, David Andersen, and Ian Goodfellow. 2019. Tensorfuzz: Debugging neural networks with coverage-guided fuzzing. In International Conference on Machine Learning. PMLR, 4901\u20134911."},{"key":"e_1_3_3_3_37_1","volume-title":"Can you trust your model\u2019s uncertainty? evaluating predictive uncertainty under dataset shift. Advances in neural information processing systems 32","author":"Ovadia Yaniv","year":"2019","unstructured":"Yaniv Ovadia, Emily Fertig, Jie Ren, Zachary Nado, David Sculley, Sebastian Nowozin, Joshua Dillon, Balaji Lakshminarayanan, and Jasper Snoek. 2019. Can you trust your model\u2019s uncertainty? evaluating predictive uncertainty under dataset shift. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3375627.3375841"},{"key":"e_1_3_3_3_39_1","unstructured":"Pew Research Center. 2017. Online Harassment 2017. https:\/\/www.pewresearch.org\/internet\/2017\/07\/11\/online-harassment-2017\/"},{"key":"e_1_3_3_3_40_1","volume-title":"Dataset shift in machine learning","author":"Quinonero-Candela Joaquin","unstructured":"Joaquin Quinonero-Candela, Masashi Sugiyama, Anton Schwaighofer, and Neil\u00a0D Lawrence. 2008. Dataset shift in machine learning. Mit Press."},{"key":"e_1_3_3_3_41_1","unstructured":"Deborah Raji Emily Denton Emily\u00a0M. Bender Alex Hanna and Amandalynne Paullada. 2021. AI and the Everything in the Whole Wide World Benchmark. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks J.\u00a0Vanschoren and S.\u00a0Yeung (Eds.). Vol.\u00a01. https:\/\/datasets-benchmarks-proceedings.neurips.cc\/paper\/2021\/file\/084b6fbb10729ed4da8c3d3f5a3ae7c9-Paper-round2.pdf"},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"e_1_3_3_3_43_1","unstructured":"Suchi Saria and Adarsh Subbaswamy. 2019. Tutorial: safe and reliable machine learning. arXiv preprint arXiv:1904.07204(2019)."},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372827"},{"key":"e_1_3_3_3_45_1","volume-title":"NIPS 2017 workshop: Machine Learning for the Developing World.","author":"Shankar Shreya","unstructured":"Shreya Shankar, Yoni Halpern, Eric Breck, James Atwood, Jimbo Wilson, and D. Sculley. 2017. No Classification without Representation: Assessing Geodiversity Issues in Open Data Sets for the Developing World. In NIPS 2017 workshop: Machine Learning for the Developing World."},{"key":"e_1_3_3_3_46_1","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Song Liwei","year":"2021","unstructured":"Liwei Song and Prateek Mittal. 2021. Systematic evaluation of privacy risks of machine learning models. In 30th USENIX Security Symposium (USENIX Security 21). 2615\u20132632."},{"key":"e_1_3_3_3_47_1","volume-title":"Guidelines for effective usage of text highlighting techniques","author":"Strobelt Hendrik","year":"2015","unstructured":"Hendrik Strobelt, Daniela Oelke, Bum\u00a0Chul Kwon, Tobias Schreck, and Hanspeter Pfister. 2015. Guidelines for effective usage of text highlighting techniques. IEEE transactions on visualization and computer graphics 22, 1(2015), 489\u2013498."},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445088"},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511160"},{"key":"e_1_3_3_3_50_1","volume-title":"Towards Intersectional Feminist and Participatory ML: A Case Study in Supporting Feminicide Counterdata Collection. In 2022 ACM Conference on Fairness, Accountability, and Transparency. 667\u2013678","author":"Suresh Harini","year":"2022","unstructured":"Harini Suresh, Rajiv Movva, Amelia\u00a0Lee Dogan, Rahul Bhargava, Isadora Crux\u00ean, \u00c1ngeles\u00a0Martinez Cuba, Guilia Taurino, Wonyoung So, and Catherine D\u2019Ignazio. 2022. Towards Intersectional Feminist and Participatory ML: A Case Study in Supporting Feminicide Counterdata Collection. In 2022 ACM Conference on Fairness, Accountability, and Transparency. 667\u2013678."},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"publisher","DOI":"10.6084\/m9.figshare.4563973.v2"},{"key":"e_1_3_3_3_52_1","unstructured":"Victor Veitch Alexander D\u2019Amour Steve Yadlowsky and Jacob Eisenstein. 2021. Counterfactual invariance to spurious correlations: Why and how to pass stress tests. arXiv preprint arXiv:2106.00545(2021)."},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2008.172"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"crossref","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue Anthony Moi Pierric Cistac Tim Rault R\u00e9mi Louf Morgan Funtowicz 2019. Huggingface\u2019s transformers: State-of-the-art natural language processing. arXiv preprint arXiv:1910.03771(2019).","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1073"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3375709"}],"event":{"name":"CHI '23: CHI Conference on Human Factors in Computing Systems","location":"Hamburg Germany","acronym":"CHI '23","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3544548.3581482","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3544548.3581482","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:56Z","timestamp":1750178816000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3544548.3581482"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,19]]},"references-count":56,"alternative-id":["10.1145\/3544548.3581482","10.1145\/3544548"],"URL":"https:\/\/doi.org\/10.1145\/3544548.3581482","relation":{},"subject":[],"published":{"date-parts":[[2023,4,19]]},"assertion":[{"value":"2023-04-19","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}