{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T12:59:04Z","timestamp":1776085144262,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":110,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,23]]},"DOI":"10.1145\/3715275.3732005","type":"proceedings-article","created":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T17:01:18Z","timestamp":1750698078000},"page":"45-60","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Examining the Expanding Role of Synthetic Data Throughout the AI Development Pipeline"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0152-4311","authenticated-orcid":false,"given":"Shivani","family":"Kapania","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5174-4654","authenticated-orcid":false,"given":"Stephanie","family":"Ballard","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3863-8877","authenticated-orcid":false,"given":"Alex","family":"Kessler","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7807-2018","authenticated-orcid":false,"given":"Jennifer Wortman","family":"Vaughan","sequence":"additional","affiliation":[{"name":"Microsoft, New York City, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,23]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-10925-7_31"},{"key":"e_1_3_3_1_3_2","unstructured":"Marah Abdin Jyoti Aneja Harkirat Behl S\u00e9bastien Bubeck Ronen Eldan Suriya Gunasekar Michael Harrison Russell\u00a0J Hewett Mojan Javaheripi Piero Kauffmann et\u00a0al. 2024. Phi-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.08905 (2024)."},{"key":"e_1_3_3_1_4_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642703"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3383455.3422554"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.3386\/w32474"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00352"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372859"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Emily\u00a0M Bender and Batya Friedman. 2018. Data statements for natural language processing: Toward mitigating system bias and enabling better science. Transactions of the Association for Computational Linguistics 6 (2018) 587\u2013604.","DOI":"10.1162\/tacl_a_00041"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1037\/13620-004"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Virginia Braun and Victoria Clarke. 2019. Reflecting on reflexive thematic analysis. Qualitative research in sport exercise and health 11 4 (2019) 589\u2013597.","DOI":"10.1080\/2159676X.2019.1628806"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Sarah Burkhardt and Bernhard Rieder. 2024. Foundation models are platform models: Prompting and the political economy of AI. Big Data & Society 11 2 (2024) 20539517241247839.","DOI":"10.1177\/20539517241247839"},{"key":"e_1_3_3_1_14_2","unstructured":"Cheng-Han Chiang and Hung-yi Lee. 2023. Can large language models be an alternative to human evaluations? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.01937 (2023)."},{"key":"e_1_3_3_1_15_2","unstructured":"Kasia Chmielinski Sarah Newman Chris\u00a0N. Kranzinger Michael Hind Jennifer\u00a0Wortman Vaughan Margaret Mitchell Julia Stoyanovich Angelina McMillan-Major Emily McReynolds Kathleen Esfahany Mary\u00a0L. Gray Audrey Chang and Maui Hudson. 2024. The CLeAR Documentation Framework for AI Transparency: Recommendations for Practitioners & Context for Policymakers. Harvard Kennedy School Shorenstein Center discussion paper."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Anamaria Crisan Brittany Fiore-Gartland and Melanie Tory. 2020. Passing the data baton: A retrospective analysis on data science work and workers. IEEE Transactions on Visualization and Computer Graphics 27 2 (2020) 1860\u20131870.","DOI":"10.1109\/TVCG.2020.3030340"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Jessamyn Dahmen and Diane Cook. 2019. SynSys: A synthetic data generation system for healthcare applications. Sensors 19 5 (2019) 1181.","DOI":"10.3390\/s19051181"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3617694.3623261"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533113"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594037"},{"key":"e_1_3_3_1_21_2","unstructured":"Yann Dubois Chen\u00a0Xuechen Li Rohan Taori Tianyi Zhang Ishaan Gulrajani Jimmy Ba Carlos Guestrin Percy\u00a0S Liang and Tatsunori\u00a0B Hashimoto. 2024. Alpacafarm: A simulation framework for methods that learn from human feedback. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Madeleine\u00a0Clare Elish and Danah Boyd. 2018. Situating methods in the magic of Big Data and AI. Communication monographs 85 1 (2018) 57\u201380.","DOI":"10.1080\/03637751.2017.1375130"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Alexander\u00a0R Fabbri Wojciech Kry\u015bci\u0144ski Bryan McCann Caiming Xiong Richard Socher and Dragomir Radev. 2021. Summeval: Re-evaluating summarization evaluation. Transactions of the Association for Computational Linguistics 9 (2021) 391\u2013409.","DOI":"10.1162\/tacl_a_00373"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Shangbin Feng Vidhisha Balachandran Yuyang Bai and Yulia Tsvetkov. 2023. Factkb: Generalizable factuality evaluation using language models enhanced with factual knowledge. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.08281 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.59"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","unstructured":"Andrew Fitzgerald. 2024. Why Synthetic Data Can Never Be Ethical: A Lesson from Media Ethics. Surveillance & Society 22 4 (Dec. 2024) 477\u2013482. 10.24908\/ss.v22i4.18324","DOI":"10.24908\/ss.v22i4.18324"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Timnit Gebru Jamie Morgenstern Briana Vecchione Jennifer\u00a0Wortman Vaughan Hanna Wallach Hal Daum\u00e9\u00a0III and Kate Crawford. 2021. Datasheets for datasets. Commun. ACM 64 12 (December 2021) 86\u201392.","DOI":"10.1145\/3458723"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Fabrizio Gilardi Meysam Alizadeh and Ma\u00ebl Kubli. 2023. ChatGPT outperforms crowd workers for text-annotation tasks. Proceedings of the National Academy of Sciences 120 30 (2023) e2305016120.","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Lisa Gitelman. 2013. \u201cRaw Data\u201d Is an Oxymoron.","DOI":"10.7551\/mitpress\/9302.001.0001"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2020. Generative adversarial networks. Commun. ACM 63 11 (2020) 139\u2013144.","DOI":"10.1145\/3422622"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Jianping Gou Baosheng Yu Stephen\u00a0J. Maybank and Dacheng Tao. 2021. Knowledge Distillation: A Survey. International Journal of Computer Vision 129 (2021) 1789\u20131819.","DOI":"10.1007\/s11263-021-01453-z"},{"key":"e_1_3_3_1_31_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et\u00a0al. 2024. The llama 3 herd of models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Igor Grossmann Matthew Feinberg Dawn\u00a0C Parker Nicholas\u00a0A Christakis Philip\u00a0E Tetlock and William\u00a0A Cunningham. 2023. AI and the transformation of social science research. Science 380 6650 (2023) 1108\u20131109.","DOI":"10.1126\/science.adi1778"},{"key":"e_1_3_3_1_33_2","unstructured":"Tom Gunter Zirui Wang Chong Wang Ruoming Pang Andy Narayanan Aonan Zhang Bowen Zhang Chen Chen Chung-Cheng Chiu David Qiu et\u00a0al. 2024. Apple intelligence foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21075 (2024)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Alon Halevy Peter Norvig and Fernando Pereira. 2009. The unreasonable effectiveness of data. IEEE intelligent systems 24 2 (2009) 8\u201312.","DOI":"10.1109\/MIS.2009.36"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Lei Han Tianwa Chen Gianluca Demartini Marta Indulska and Shazia Sadiq. 2023. A data-driven analysis of behaviors in data curation processes. ACM Transactions on Information Systems 41 3 (2023) 1\u201335.","DOI":"10.1145\/3567419"},{"key":"e_1_3_3_1_36_2","unstructured":"Alex Hanna and Tina\u00a0M Park. 2020. Against scale: Provocations and resistances to scale thinking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.08850 (2020)."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Thomas Hartvigsen Saadia Gabriel Hamid Palangi Maarten Sap Dipankar Ray and Ece Kamar. 2022. ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection. http:\/\/arxiv.org\/abs\/2203.09509 arXiv:https:\/\/arXiv.org\/abs\/2203.09509 [cs].","DOI":"10.18653\/v1\/2022.acl-long.234"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Amy\u00a0K Heger Liz\u00a0B Marquis Mihaela Vorvoreanu Hanna Wallach and Jennifer Wortman\u00a0Vaughan. 2022. Understanding machine learning practitioners\u2019 data documentation perceptions needs challenges and desiderata. Proceedings of the ACM on Human-Computer Interaction 6 CSCW2 (2022) 1\u201329.","DOI":"10.1145\/3555760"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","unstructured":"Paula Helm Benjamin Lipp and Roser Pujadas. 2024. Generating reality and silencing debate: Synthetic data as discursive device. Big Data & Society 11 2 (June 2024) 20539517241249447. 10.1177\/20539517241249447Publisher: SAGE Publications Ltd.","DOI":"10.1177\/20539517241249447"},{"key":"e_1_3_3_1_40_2","unstructured":"Geoffrey\u00a0E. Hinton Oriol Vinyals and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1503.02531."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300830"},{"key":"e_1_3_3_1_42_2","unstructured":"Qisheng Hu Kaixin Li Xu Zhao Yuxi Xie Tiedong Liu Hui Chen Qizhe Xie and Junxian He. 2023. Instructcoder: Empowering language models for code editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.20329 (2023)."},{"key":"e_1_3_3_1_43_2","unstructured":"Wenlong Huang Fei Xia Ted Xiao Harris Chan Jacky Liang Pete Florence Andy Zeng Jonathan Tompson Igor Mordatch Yevgen Chebotar et\u00a0al. 2022. Inner monologue: Embodied reasoning through planning with language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2207.05608 (2022)."},{"key":"e_1_3_3_1_44_2","unstructured":"IBM. 2023. What is Synthetic Data? | IBM. https:\/\/www.ibm.com\/think\/topics\/synthetic-data [Online; accessed 2025-01-21]."},{"key":"e_1_3_3_1_45_2","unstructured":"Apple Inc.2024. Introducing Apple\u2019s On-Device and Server Foundation Models - Apple Machine Learning Research. https:\/\/machinelearning.apple.com\/research\/introducing-apple-foundation-models [Online; accessed 2025-01-21]."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","unstructured":"Benjamin\u00a0N Jacobsen. 2023. Machine learning and the politics of synthetic data. Big Data & Society 10 1 (Jan. 2023) 20539517221145372. 10.1177\/20539517221145372Publisher: SAGE Publications Ltd.","DOI":"10.1177\/20539517221145372"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00394"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600211.3604681"},{"key":"e_1_3_3_1_49_2","unstructured":"James Jordon Lukasz Szpruch Florimond Houssiau Mirko Bottarelli Giovanni Cherubin Carsten Maple Samuel\u00a0N Cohen and Adrian Weller. 2022. Synthetic Data\u2013what why and how? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.03257 (2022)."},{"key":"e_1_3_3_1_50_2","volume-title":"International conference on learning representations","author":"Jordon James","year":"2018","unstructured":"James Jordon, Jinsung Yoon, and Mihaela Van Der\u00a0Schaar. 2018. PATE-GAN: Generating synthetic data with differential privacy guarantees. In International conference on learning representations."},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580645"},{"key":"e_1_3_3_1_52_2","unstructured":"Diederik\u00a0P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2013)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","unstructured":"Laura Koesten Elena Simperl Tom Blount Emilia Kacprzak and Jeni Tennison. 2020. Everything you always wanted to know about a dataset: Studies in data summarisation. International Journal of Human-Computer Studies 135 (March 2020) 102367. 10.1016\/j.ijhcs.2019.10.004","DOI":"10.1016\/j.ijhcs.2019.10.004"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00279"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"crossref","unstructured":"Francis Lee Saghi Hajisharif and Ericka Johnson. 2025. The ontological politics of synthetic data: Normalities outliers and intersectional hallucinations. Big Data & Society 12 2 (2025) 20539517251318289.","DOI":"10.1177\/20539517251318289"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"crossref","unstructured":"Peter Lee. 2024. Synthetic Data and the Future of AI. (2024).","DOI":"10.2139\/ssrn.5281032"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"e_1_3_3_1_58_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","unstructured":"Ruibo Liu Jerry Wei Fangyu Liu Chenglei Si Yanzhe Zhang Jinmeng Rao Steven Zheng Daiyi Peng Diyi Yang Denny Zhou and Andrew\u00a0M. Dai. 2024. Best Practices and Lessons Learned on Synthetic Data for Language Models. 10.48550\/arXiv.2404.07503arXiv:https:\/\/arXiv.org\/abs\/2404.07503 [cs].","DOI":"10.48550\/arXiv.2404.07503"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"crossref","unstructured":"Dieuwertje Luitse and Wiebke Denkena. 2021. The great transformer: Examining the role of large language models in the political economy of AI. Big Data & Society 8 2 (2021) 20539517211047734.","DOI":"10.1177\/20539517211047734"},{"key":"e_1_3_3_1_61_2","unstructured":"Haipeng Luo Qingfeng Sun Can Xu Pu Zhao Jianguang Lou Chongyang Tao Xiubo Geng Qingwei Lin Shifeng Chen and Dongmei Zhang. 2023. Wizardmath: Empowering mathematical reasoning for large language models via reinforced evol-instruct. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.09583 (2023)."},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"crossref","unstructured":"Michael Madaio Lisa Egede Hariharan Subramonyam Jennifer Wortman\u00a0Vaughan and Hanna Wallach. 2022. Assessing the Fairness of AI Systems: AI Practitioners\u2019 Processes Challenges and Needs for Support. Proceedings of the ACM on Human-Computer Interaction 6 CSCW1 (2022) 26\u00a0pages.","DOI":"10.1145\/3512899"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376445"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"crossref","unstructured":"Daniella Meeker Crystal Kallem Yan Heras Stephanie Garcia and Casey Thompson. 2022. Case report: evaluation of an open-source synthetic data platform for simulation studies. JAMIA open 5 3 (2022) ooac067.","DOI":"10.1093\/jamiaopen\/ooac067"},{"key":"e_1_3_3_1_65_2","unstructured":"Cade Metz Cecilia Kang Sheera Frenkel Stuart\u00a0A. Thompson and Nico Grant. 2024. How Tech Giants Cut Corners to Harvest Data for A.I. - The New York Times. https:\/\/www.nytimes.com\/2024\/04\/06\/technology\/tech-giants-harvest-data-artificial-intelligence.html [Online; accessed 2025-01-21]."},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"crossref","unstructured":"Milagros Miceli Martin Schuessler and Tianling Yang. 2020. Between subjectivity and imposition: Power dynamics in data annotation for computer vision. Proceedings of the ACM on Human-Computer Interaction 4 CSCW2 (2020) 1\u201325.","DOI":"10.1145\/3415186"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287596"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300356"},{"key":"e_1_3_3_1_69_2","unstructured":"Laura Nader. 1972. Up the anthropologist: Perspectives gained from studying up. (1972)."},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-75178-4"},{"key":"e_1_3_3_1_71_2","unstructured":"Code of Practice Working\u00a0Groups. 2024. Second Draft of the General-Purpose AI Code of Practice. https:\/\/digital-strategy.ec.europa.eu\/en\/library\/second-draft-general-purpose-ai-code-practice-published-written-independent-experts [Online; accessed 2025-01-20]."},{"key":"e_1_3_3_1_72_2","unstructured":"Office of\u00a0the Assistant Secretary\u00a0for Planning and Evaluation (ASPE). 2022. A Synthetic Health Data Generation Engine to Accelerate Patient-Centered Outcomes Research | ASPE. https:\/\/aspe.hhs.gov\/synthetic-health-data-generation-engine-accelerate-patient-centered-outcomes-research [Online; accessed 2025-01-22]."},{"key":"e_1_3_3_1_73_2","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et\u00a0al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems 35 (2022) 27730\u201327744."},{"key":"e_1_3_3_1_74_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_1_75_2","doi-asserted-by":"publisher","DOI":"10.1145\/2998181.2998331"},{"key":"e_1_3_3_1_76_2","doi-asserted-by":"crossref","unstructured":"Samir Passi and Steven\u00a0J Jackson. 2018. Trust in data science: Collaboration translation and accountability in corporate data science projects. Proceedings of the ACM on Human-Computer Interaction 2 CSCW (2018) 1\u201328.","DOI":"10.1145\/3274405"},{"key":"e_1_3_3_1_77_2","unstructured":"Personal Data Protection\u00a0Commission (PDPC). 2024. Privacy Enhancing Technology (PET): Proposed Guide On Synthetic Data Generation. https:\/\/www.pdpc.gov.sg\/-\/media\/files\/pdpc\/pdf-files\/other-guides\/proposed-guide-on-synthetic-data-generation.pdf [Online; accessed 2025-01-20]."},{"key":"e_1_3_3_1_78_2","doi-asserted-by":"crossref","unstructured":"Ethan Perez Saffron Huang Francis Song Trevor Cai Roman Ring John Aslanides Amelia Glaese Nat McAleese and Geoffrey Irving. 2022. Red Teaming Language Models with Language Models. arXiv:https:\/\/arXiv.org\/abs\/2202.03286 [cs] (Feb. 2022). http:\/\/arxiv.org\/abs\/2202.03286 arXiv:https:\/\/arXiv.org\/abs\/2202.03286.","DOI":"10.18653\/v1\/2022.emnlp-main.225"},{"key":"e_1_3_3_1_79_2","unstructured":"Crystal Qian Michael\u00a0Xieyang Liu Emily Reif Grady Simon Nada Hussein Nathan Clement James Wexler Carrie\u00a0J Cai Michael Terry and Minsuk Kahng. 2024. The Evolution of LLM Adoption in Industry Data Curation Practices. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.16089 (2024)."},{"key":"e_1_3_3_1_80_2","doi-asserted-by":"crossref","unstructured":"Bogdana Rakova Jingying Yang Henriette Cramer and Rumman Chowdhury. 2021. Where responsible AI meets reality: Practitioner perspectives on enablers for shifting organizational practices. Proceedings of the ACM on Human-Computer Interaction 5 CSCW1 (2021) 1\u201323.","DOI":"10.1145\/3449081"},{"key":"e_1_3_3_1_81_2","unstructured":"Arij Riabi Thomas Scialom Rachel Keraron Beno\u00eet Sagot Djam\u00e9 Seddah and Jacopo Staiano. 2020. Synthetic data augmentation for zero-shot cross-lingual question answering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.12643 (2020)."},{"key":"e_1_3_3_1_82_2","unstructured":"Kevin Roose. 2024. Data for A.I. Training Is Disappearing Fast Study Shows - The New York Times. https:\/\/www.nytimes.com\/2024\/07\/19\/technology\/ai-data-restrictions.html [Online; accessed 2025-01-21]."},{"key":"e_1_3_3_1_83_2","doi-asserted-by":"crossref","unstructured":"Roy\u00a0A Ruddle James Cheshire and Sara\u00a0Johansson Fernstad. 2023. Tasks and visualizations used for data profiling: A survey and interview study. IEEE Transactions on Visualization and Computer Graphics (2023).","DOI":"10.1109\/TVCG.2023.3234337"},{"key":"e_1_3_3_1_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445518"},{"key":"e_1_3_3_1_85_2","unstructured":"Siamak Shakeri Noah Constant Mihir\u00a0Sanjay Kale and Linting Xue. 2020. Towards zero-shot multilingual synthetic question and answer generation for cross-lingual reading comprehension. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.12008 (2020)."},{"key":"e_1_3_3_1_86_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00536-8_1"},{"key":"e_1_3_3_1_87_2","doi-asserted-by":"crossref","unstructured":"Ilia Shumailov Zakhar Shumaylov Yiren Zhao Nicolas Papernot Ross Anderson and Yarin Gal. 2024. AI models collapse when trained on recursively generated data. Nature 631 (2024) 755\u2013759.","DOI":"10.1038\/s41586-024-07566-y"},{"key":"e_1_3_3_1_88_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445092"},{"key":"e_1_3_3_1_89_2","doi-asserted-by":"crossref","unstructured":"Daniel Susser Daniel\u00a0S Schiff Sara Gerke Laura\u00a0Y Cabrera I\u00a0Glenn Cohen Megan Doerr Jordan Harrod Kristin Kostick-Quenet Jasmine McNealy Michelle\u00a0N Meyer et\u00a0al. 2024. Synthetic Health Data: Real Ethical Promise and Peril. Hastings Center Report 54 5 (2024) 8\u201313.","DOI":"10.1002\/hast.4911"},{"key":"e_1_3_3_1_90_2","doi-asserted-by":"publisher","unstructured":"Daniel Susser Daniel\u00a0S. Schiff Sara Gerke Laura\u00a0Y. Cabrera I.\u00a0Glenn Cohen Megan Doerr Jordan Harrod Kristin Kostick-Quenet Jasmine McNealy Michelle\u00a0N. Meyer W.\u00a0Nicholson Price\u00a0II and Jennifer\u00a0K. Wagner. 2024. Synthetic Health Data: Real Ethical Promise and Peril. Hastings Center Report 54 5 (2024) 8\u201313. 10.1002\/hast.4911_eprint: https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/hast.4911.","DOI":"10.1002\/hast.4911"},{"key":"e_1_3_3_1_91_2","unstructured":"Daniel Susser and Jeremy Seeman. [n. d.]. Dialogue Critical Provocations for Synthetic Data. ([n. d.])."},{"key":"e_1_3_3_1_92_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M Dai Anja Hauth Katie Millican et\u00a0al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_1_93_2","doi-asserted-by":"crossref","unstructured":"Risto Uuk Annemieke Brouwer Tim Schreier Noemi Dreksler Valeria Pulignano and Rishi Bommasani. 2024. Effective Mitigations for Systemic Risks from General-Purpose AI. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.02145 (2024).","DOI":"10.2139\/ssrn.5021463"},{"key":"e_1_3_3_1_94_2","doi-asserted-by":"publisher","DOI":"10.1145\/1958824.1958906"},{"key":"e_1_3_3_1_95_2","unstructured":"Angelina Wang Jamie Morgenstern and John\u00a0P. Dickerson. 2024. Large language models should not replace human participants because they can misportray and flatten identity groups. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.01908."},{"key":"e_1_3_3_1_96_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502121"},{"key":"e_1_3_3_1_97_2","doi-asserted-by":"publisher","DOI":"10.1145\/3630106.3659002"},{"key":"e_1_3_3_1_98_2","doi-asserted-by":"crossref","unstructured":"David\u00a0Gray Widder and Dawn Nafus. 2023. Dislocated accountabilities in the \u201cAI supply chain\u201d: Modularity and developers\u2019 notions of responsibility. Big Data & Society 10 1 (2023) 20539517231177620.","DOI":"10.1177\/20539517231177620"},{"key":"e_1_3_3_1_99_2","doi-asserted-by":"crossref","unstructured":"David\u00a0Gray Widder Sarah West and Meredith Whittaker. 2023. Open (for business): Big tech concentrated power and the political economy of open AI. Concentrated Power and the Political Economy of Open AI (August 17 2023) (2023).","DOI":"10.2139\/ssrn.4543807"},{"key":"e_1_3_3_1_100_2","unstructured":"David\u00a0Gray Widder and Richmond Wong. 2023. Thinking upstream: Ethics and policy opportunities in AI supply chains. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.07529 (2023)."},{"key":"e_1_3_3_1_101_2","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594012"},{"key":"e_1_3_3_1_102_2","doi-asserted-by":"publisher","unstructured":"Tanja Wiehn. 2024. Synthetic Data: From Data Scarcity to Data Pollution. Surveillance & Society 22 4 (Dec. 2024) 472\u2013476. 10.24908\/ss.v22i4.18327","DOI":"10.24908\/ss.v22i4.18327"},{"key":"e_1_3_3_1_103_2","unstructured":"Matteo Wong. 2024. The GPT Era Is Already Ending - The Atlantic. https:\/\/www.theatlantic.com\/technology\/archive\/2024\/12\/openai-o1-reasoning-models\/680906\/?utm_source=chatgpt.com [Online; accessed 2025-01-22]."},{"key":"e_1_3_3_1_104_2","unstructured":"Kanit Wongsuphasawat Yang Liu and Jeffrey Heer. 2019. Goals process and challenges of exploratory data analysis: An interview study. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1911.00568 (2019)."},{"key":"e_1_3_3_1_105_2","doi-asserted-by":"publisher","DOI":"10.1145\/3630106.3659029"},{"key":"e_1_3_3_1_106_2","unstructured":"Liyang Xie Kaixiang Lin Shu Wang Fei Wang and Jiayu Zhou. 2018. Differentially private generative adversarial network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1802.06739 (2018)."},{"key":"e_1_3_3_1_107_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01113"},{"key":"e_1_3_3_1_108_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580900"},{"key":"e_1_3_3_1_109_2","unstructured":"Longhui Yu Weisen Jiang Han Shi Jincheng Yu Zhengying Liu Yu Zhang James\u00a0T Kwok Zhenguo Li Adrian Weller and Weiyang Liu. 2023. Metamath: Bootstrap your own mathematical questions for large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.12284 (2023)."},{"key":"e_1_3_3_1_110_2","doi-asserted-by":"crossref","unstructured":"Amy\u00a0X Zhang Michael Muller and Dakuo Wang. 2020. How do data science workers collaborate? Roles workflows and tools. Proceedings of the ACM on Human-Computer Interaction 4 CSCW1 (2020) 1\u201323.","DOI":"10.1145\/3392826"},{"key":"e_1_3_3_1_111_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et\u00a0al. 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems 36 (2023) 46595\u201346623."}],"event":{"name":"FAccT '25: The 2025 ACM Conference on Fairness, Accountability, and Transparency","location":"Athens Greece","acronym":"FAccT '25"},"container-title":["Proceedings of the 2025 ACM Conference on Fairness, Accountability, and Transparency"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3715275.3732005","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,24]],"date-time":"2025-06-24T11:12:13Z","timestamp":1750763533000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3715275.3732005"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,23]]},"references-count":110,"alternative-id":["10.1145\/3715275.3732005","10.1145\/3715275"],"URL":"https:\/\/doi.org\/10.1145\/3715275.3732005","relation":{},"subject":[],"published":{"date-parts":[[2025,6,23]]},"assertion":[{"value":"2025-06-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}