{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T20:08:59Z","timestamp":1779480539506,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":131,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T00:00:00Z","timestamp":1772496000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,3]]},"DOI":"10.1145\/3788646.3789521","type":"proceedings-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T19:48:12Z","timestamp":1779479292000},"page":"1-14","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Common Pool of Privacy Problems: Legal and Technical Lessons from a Large-Scale Web-Scraped Machine Learning Dataset"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4275-653X","authenticated-orcid":false,"given":"Rachel","family":"Hong","sequence":"first","affiliation":[{"name":"University of Washington, Seattle, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3312-1733","authenticated-orcid":false,"given":"Jevan","family":"Hutson","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1362-554X","authenticated-orcid":false,"given":"William","family":"Agnew","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7825-2850","authenticated-orcid":false,"given":"Imaad","family":"Huda","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4899-226X","authenticated-orcid":false,"given":"Tadayoshi","family":"Kohno","sequence":"additional","affiliation":[{"name":"Georgetown University, Washington, D.C., USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3753-8405","authenticated-orcid":false,"given":"Jamie","family":"Morgenstern","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,5,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"California Consumer Privacy Act of","author":"Civ CA","year":"2018","unstructured":"CA Civ Code \u00a7 1798.192. 2018. California Consumer Privacy Act of 2018. https:\/\/leginfo.legislature.ca.gov\/faces\/codes_displayText.xhtml?division=3.&part=4.&lawCode=CIV&title=1.81.5"},{"key":"e_1_3_2_1_2_1","volume-title":"Oregon Consumer Privacy Act of","author":"OR SB","year":"2018","unstructured":"OR SB 619. 2018. Oregon Consumer Privacy Act of 2018. https:\/\/olis.oregonlegislature.gov\/liz\/2023R1\/Downloads\/MeasureDocument\/SB619\/Enrolled"},{"key":"e_1_3_2_1_3_1","volume-title":"Children's Online Privacy Protection Act of","author":"U.S.C.","year":"1998","unstructured":"15 U.S.C. \u00a7 6501. 1998. Children's Online Privacy Protection Act of 1998. https:\/\/uscode.house.gov\/view.xhtml?req=granuleid%3AUSC-prelim-title15-section6501&edition=prelim"},{"key":"e_1_3_2_1_4_1","unstructured":"Adobe. 2025. Content Credentials overview. https:\/\/helpx.adobe.com\/creative-cloud\/help\/content-credentials.html"},{"key":"e_1_3_2_1_5_1","unstructured":"Stability AI. 2025. https:\/\/stability.ai\/news\/stable-diffusion-public-release"},{"key":"e_1_3_2_1_6_1","unstructured":"Spawning AI. 2025. Spawning API. https:\/\/api.spawning.ai\/spawning-api"},{"key":"e_1_3_2_1_7_1","unstructured":"Amazon. 2025. DetectFaces. https:\/\/docs.aws.amazon.com\/rekognition\/latest\/APIReference\/API_DetectFaces.html"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","first-page":"55320","DOI":"10.52202\/075280-2415","article-title":"Ethical considerations for responsible data curation","volume":"36","author":"Andrews Jerone","year":"2023","unstructured":"Jerone Andrews, Dora Zhao, William Thong, Apostolos Modas, Orestis Papakyriakopoulos, and Alice Xiang. 2023. Ethical considerations for responsible data curation. Advances in Neural Information Processing Systems 36 (2023), 55320\u201355360.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","unstructured":"Internet Archive. 2013. Wayback Machine APIs. https:\/\/archive.org\/help\/wayback_api.php"},{"key":"e_1_3_2_1_10_1","unstructured":"Rosana Ardila Megan Branson Kelly Davis Michael Kohler Josh Meyer Michael Henretty Reuben Morais Lindsay Saunders Francis Tyers and Gregor Weber. 2020. Common Voice: A Massively-Multilingual Speech Corpus. In Proceedings of the Twelfth Language Resources and Evaluation Conference Nicoletta Calzolari Fr\u00e9d\u00e9ric B\u00e9chet Philippe Blache Khalid Choukri Christopher Cieri Thierry Declerck Sara Goggi Hitoshi Isahara Bente Maegaard Joseph Mariani H\u00e9l\u00e8ne Mazo Asuncion Moreno Jan Odijk and Stelios Piperidis (Eds.). European Language Resources Association Marseille France 4218\u20134222. https:\/\/aclanthology.org\/2020.lrec-1.520\/"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1214\/aos\/1013699998"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533083"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. Curran Associates Inc.","author":"Birhane Abeba","year":"2023","unstructured":"Abeba Birhane, Vinay Prabhu, Sang Han, Vishnu Naresh Boddeti, and Alexandra Sasha Luccioni. 2023. Into the LAION's den: Investigating hate in multimodal datasets. In Proceedings of the 37th International Conference on Neural Information Processing Systems. Curran Associates Inc., New Orleans, LA, USA, Article 930, 17 pages."},{"key":"e_1_3_2_1_14_1","volume-title":"Vinay Uday Prabhu, and Emmanuel Kahembwe","author":"Birhane Abeba","year":"2021","unstructured":"Abeba Birhane, Vinay Uday Prabhu, and Emmanuel Kahembwe. 2021. Multimodal datasets: Misogyny, pornography, and malignant stereotypes."},{"key":"e_1_3_2_1_15_1","volume-title":"2nd IEEE Conference on Secure and Trustworthy Machine Learning. IEEE","author":"Birhane Abeba","year":"2024","unstructured":"Abeba Birhane, Ryan Steed, Victor Ojewale, Briana Vecchione, and Inioluwa Deborah Raji. 2024. SoK: AI Auditing: The Broken Bus on the Road to AI Accountability. In 2nd IEEE Conference on Secure and Trustworthy Machine Learning. IEEE, Toronto, Canada, 33 pages."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0265"},{"key":"e_1_3_2_1_17_1","unstructured":"Rishi Bommasani Drew A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill et al. 2021. On the opportunities and risks of foundation models."},{"key":"e_1_3_2_1_18_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. Language models are few-shot learners. Advances in Neural Information Processing Systems 33 (2020), 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1021316409277"},{"key":"e_1_3_2_1_20_1","volume-title":"Gregg Vanderheiden, Wendy Chisholm, John Slatin, and Jason White.","author":"Caldwell Ben","year":"2008","unstructured":"Ben Caldwell, Michael Cooper, Loretta Guarino Reid, Gregg Vanderheiden, Wendy Chisholm, John Slatin, and Jason White. 2008. Web content accessibility guidelines (WCAG) 2.0. WWW Consortium (W3C) 290, 1\u201334 (2008), 5\u201312."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3704262"},{"key":"e_1_3_2_1_22_1","volume-title":"2023 International Conference on Digital Image Computing: Techniques and Applications. IEEE, Port Macquarie, New South Wales, Australia, 348\u2013355","author":"Chen Yunzhuo","year":"2023","unstructured":"Yunzhuo Chen, Nur Al Hasan Haldar, Naveed Akhtar, and Ajmal Mian. 2023. Text-image guided Diffusion Model for generating Deepfake celebrity interactions. In 2023 International Conference on Digital Image Computing: Techniques and Applications. IEEE, Port Macquarie, New South Wales, Australia, 348\u2013355."},{"key":"e_1_3_2_1_23_1","unstructured":"Cloudflare. 2024. Cloudflare API v4 documentation: Get multiple domain details. https:\/\/developers.cloudflare.com\/api\/operations\/domain-intelligenceget-multiple-domain-details"},{"key":"e_1_3_2_1_24_1","unstructured":"Federal Trade Commission. 2025. COPPA Safe Harbor Program. https:\/\/www.ftc.gov\/enforcement\/coppa-safe-harbor-program"},{"key":"e_1_3_2_1_25_1","unstructured":"Creative Commons. 2025. CC BY 4.0. https:\/\/creativecommons.org\/licenses\/by\/4.0\/deed.en"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533143"},{"key":"e_1_3_2_1_27_1","unstructured":"Common Crawl. 2025. Common Crawl. https:\/\/commoncrawl.org"},{"key":"e_1_3_2_1_28_1","unstructured":"Common Crawl. 2025. Frequently asked questions. https:\/\/commoncrawl.org\/faq"},{"key":"e_1_3_2_1_29_1","unstructured":"DataComp. 2023. DataComp. https:\/\/github.com\/mlfoundations\/datacomp"},{"key":"e_1_3_2_1_30_1","unstructured":"DataComp. 2023. Is there overlap between common-pool and laion-5B? https:\/\/github.com\/mlfoundations\/datacomp\/issues\/19"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1177\/20539517211035955"},{"key":"e_1_3_2_1_32_1","volume-title":"An archival perspective on pretraining data. Patterns 5, 4","author":"Desai Meera A","year":"2024","unstructured":"Meera A Desai, Irene V Pasquetto, Abigail Z Jacobs, and Dallas Card. 2024. An archival perspective on pretraining data. Patterns 5, 4 (2024), 11 pages."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.98"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.741"},{"key":"e_1_3_2_1_35_1","volume-title":"Ethical and Privacy Considerations for Research Using Online Fandom Data. Transformative works and cultures 33","author":"Dym Brianna","year":"2020","unstructured":"Brianna Dym and Casey Fiesler. 2020. Ethical and Privacy Considerations for Research Using Online Fandom Data. Transformative works and cultures 33 (2020), 19 pages."},{"key":"e_1_3_2_1_36_1","unstructured":"EasyOCR. 2025. EasyOCR. https:\/\/www.jaided.ai\/easyocr\/"},{"key":"e_1_3_2_1_37_1","unstructured":"European Parliament and Council of the European Union. 2016. Regulation (EU) 2016\/679 of the European Parliament and of the Council. European Union. https:\/\/data.europa.eu\/eli\/reg\/2016\/679\/oj"},{"key":"e_1_3_2_1_38_1","unstructured":"Hugging Face. 2025. https:\/\/huggingface.co\/api\/datasets\/mlfoundations\/datacomp_pools?expand%5B%5D=downloads&expand%5B%5D=downloadsAllTime"},{"key":"e_1_3_2_1_39_1","volume-title":"Participant","author":"Fiesler Casey","year":"2018","unstructured":"Casey Fiesler and Nicholas Proferes. 2018. \u201cParticipant\u201d perceptions of Twitter research ethics. Social Media+ Society 4, 1 (2018), 2056305118763366."},{"key":"e_1_3_2_1_40_1","unstructured":"Flaticon. 2025. https:\/\/www.flaticon.com\/authors\/freepik"},{"key":"e_1_3_2_1_41_1","unstructured":"Freepik. 2025. https:\/\/www.freepik.com"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. Curran Associates Inc.","author":"Gadre Samir Yitzhak","year":"2023","unstructured":"Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, Eyal Orgad, Rahim Entezari, Giannis Daras, Sarah Pratt, Vivek Ramanujan, Yonatan Bitton, Kalyani Marathe, Stephen Mussmann, Richard Vencu, Mehdi Cherti, Ranjay Krishna, Pang Wei Koh, Olga Saukh, Alexander Ratner, Shuran Song, Hannaneh Hajishirzi, Ali Farhadi, Romain Beaumont, Sewoong Oh, Alex Dimakis, Jenia Jitsev, Yair Carmon, Vaishaal Shankar, and Ludwig Schmidt. 2023. DataComp: in search of the next generation of multimodal datasets. In Proceedings of the 37th International Conference on Neural Information Processing Systems. Curran Associates Inc., New Orleans, LA, USA, Article 1179, 21 pages."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713171"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3715275.3732195"},{"key":"e_1_3_2_1_45_1","unstructured":"Github. 2023. img2dataset ignores X-Robots-Tag. https:\/\/github.com\/rom1504\/img2dataset\/issues\/298"},{"key":"e_1_3_2_1_46_1","unstructured":"Github. 2023. Implement Robots.txt support. https:\/\/github.com\/rom1504\/img2dataset\/issues\/48"},{"key":"e_1_3_2_1_47_1","unstructured":"Github. 2023. Metadata download error - OSError: Consistency check failed. https:\/\/github.com\/mlfoundations\/datacomp\/issues\/33"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/s43681-021-00095-8"},{"key":"e_1_3_2_1_49_1","unstructured":"Jia Guo Jiankang Deng Alexandros Lattas and Stefanos Zafeiriou. 2021. Sample and computation redistribution for efficient face detection."},{"key":"e_1_3_2_1_50_1","unstructured":"Ritwik Gupta Leah Walker Rodolfo Corona Stephanie Fu Suzanne Petryk Janet Napolitano Trevor Darrell and Andrew W Reddie. 2024. Data-Centric AI Governance: Addressing the Limitations of Model-Focused Policies."},{"key":"e_1_3_2_1_51_1","volume-title":"We must fix the lack of transparency around the data used to train foundation models. Harvard Data Science Review Special Issue 5 (May","author":"Hardinges Jack","year":"2024","unstructured":"Jack Hardinges, Elena Simperl, and Nigel Shadbolt. 2024. We must fix the lack of transparency around the data used to train foundation models. Harvard Data Science Review Special Issue 5 (May 2024), 5 pages. https:\/\/hdsr.mitpress.mit.edu\/pub\/xau9dza3."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.21552\/edpl\/2018\/4\/5"},{"key":"e_1_3_2_1_53_1","first-page":"459","article-title":"The Public Information Fallacy","volume":"99","author":"Hartzog Woodrow","year":"2019","unstructured":"Woodrow Hartzog. 2019. The Public Information Fallacy. BUL Rev. 99 (2019), 459.","journal-title":"BUL Rev."},{"key":"e_1_3_2_1_54_1","first-page":"1343","article-title":"Surveillance as loss of obscurity","volume":"72","author":"Hartzog Woodrow","year":"2015","unstructured":"Woodrow Hartzog and Evan Selinger. 2015. Surveillance as loss of obscurity. Wash. & Lee L. Rev. 72 (2015), 1343.","journal-title":"Wash. & Lee L. Rev."},{"key":"e_1_3_2_1_55_1","volume-title":"International Conference on Financial Cryptography and Data Security. Springer, Barbados, 77\u201388","author":"Henne Benjamin","year":"2014","unstructured":"Benjamin Henne, Maximilian Koch, and Matthew Smith. 2014. On the awareness, control and privacy of shared photo metadata. In International Conference on Financial Cryptography and Data Security. Springer, Barbados, 77\u201388."},{"key":"e_1_3_2_1_56_1","first-page":"439","article-title":"From Individual Control to Social Protection: New Paradigms for Privacy Law in the Age of Predictive Analytics'(2020)","volume":"79","author":"Hirsch Dennis D","year":"2020","unstructured":"Dennis D Hirsch. 2020. From Individual Control to Social Protection: New Paradigms for Privacy Law in the Age of Predictive Analytics'(2020). Md L Rev 79 (2020), 439.","journal-title":"Md L Rev"},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the 4th ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization. ACM, San Luis Potos\u00ed, Mexico, 1\u201317","author":"Hong Rachel","year":"2024","unstructured":"Rachel Hong, William Agnew, Tadayoshi Kohno, and Jamie Morgenstern. 2024. Who's in and who's out? A case study of multimodal CLIP-filtering in DataComp. In Proceedings of the 4th ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization. ACM, San Luis Potos\u00ed, Mexico, 1\u201317."},{"key":"e_1_3_2_1_58_1","article-title":"Forget Me Not? Machine Unlearning's Implication for Privacy Law","volume":"27","author":"Hutson Jevan","year":"2025","unstructured":"Jevan Hutson, Cedric Whitney, and Jay T Conrad. 2025. Forget Me Not? Machine Unlearning's Implication for Privacy Law. Science and Technology Law Review 27, 1 (2025), 57 pages.","journal-title":"Science and Technology Law Review"},{"key":"e_1_3_2_1_59_1","first-page":"124","article-title":"America's next\u201c Stop Model!","volume":"8","author":"Hutson Jevan","year":"2024","unstructured":"Jevan Hutson and Ben Winters. 2024. America's next\u201c Stop Model!\u201d Model Deletion. Geo. L. Tech. Rev. 8 (2024), 124.","journal-title":"Model Deletion. Geo. L. Tech. Rev."},{"key":"e_1_3_2_1_60_1","unstructured":"ICO. 2025. Overview - Data Protection and the EU. https:\/\/ico.org.uk\/for-organisations\/data-protection-and-the-eu\/overview-data-protection-and-the-eu\/"},{"key":"e_1_3_2_1_61_1","unstructured":"iKeepSafe. 2025. Certified Products. https:\/\/ikeepsafe.org\/products\/#coppa"},{"key":"e_1_3_2_1_62_1","first-page":"171","article-title":"The subjects and stages of AI dataset development: A framework for dataset accountability","volume":"19","author":"Khan Mehtab","year":"2022","unstructured":"Mehtab Khan and Alex Hanna. 2022. The subjects and stages of AI dataset development: A framework for dataset accountability. Ohio St. Tech. LJ 19 (2022), 171.","journal-title":"Ohio St. Tech. LJ"},{"key":"e_1_3_2_1_63_1","unstructured":"kidSAFE. 2025. kidSAFE Seal Program Member List. https:\/\/www.kidsafeseal.com\/certiedproducts.html"},{"key":"e_1_3_2_1_64_1","volume-title":"32nd USENIX Security Symposium (USENIX Security 23)","author":"Kohno Tadayoshi","year":"2023","unstructured":"Tadayoshi Kohno, Yasemin Acar, and Wulf Loh. 2023. Ethical frameworks and computer security trolley problems: Foundations for conversations. In 32nd USENIX Security Symposium (USENIX Security 23). USENIX, Anaheim, CA, 5145\u20135162."},{"key":"e_1_3_2_1_65_1","unstructured":"LAION. 2025. Privacy Policy. https:\/\/laion.ai\/privacy-policy\/"},{"key":"e_1_3_2_1_66_1","volume-title":"Mattia Nee, Catherine Arnett, Pavel Chizhov, Eliot Krzystof Jones, Ir\u00e8ne Girard, David Mach, Anastasia Stasenko, and Ivan P Yamshchikov.","author":"Langlais Pierre-Carl","year":"2025","unstructured":"Pierre-Carl Langlais, Carlos Rosas Hinostroza, Mattia Nee, Catherine Arnett, Pavel Chizhov, Eliot Krzystof Jones, Ir\u00e8ne Girard, David Mach, Anastasia Stasenko, and Ivan P Yamshchikov. 2025. Common Corpus: The Largest Collection of Ethical Data for LLM Pre-Training."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1177\/20539517231188723"},{"key":"e_1_3_2_1_68_1","unstructured":"Christina Lee. 2025. Beyond Algorithmic Disgorgement: Remedying Algorithmic Harms."},{"key":"e_1_3_2_1_69_1","unstructured":"Chung Peng Lee Rachel Hong Harry Jiang Aster Plotnik William Agnew and Jamie Morgenstern. 2025. How do data owners say no? A case study of data consent mechanisms in web-scraped vision-language AI training datasets."},{"key":"e_1_3_2_1_70_1","first-page":"707","article-title":"Binary codes capable of correcting deletions, insertions, and reversals. Soviet physics","volume":"10","author":"Levenshtein Vladimir I.","year":"1965","unstructured":"Vladimir I. Levenshtein. 1965. Binary codes capable of correcting deletions, insertions, and reversals. Soviet physics. Doklady 10 (1965), 707\u2013710.","journal-title":"Doklady"},{"key":"e_1_3_2_1_71_1","unstructured":"Dongfang Li Zetian Sun Xinshuo Hu Zhenyu Liu Ziyang Chen Baotian Hu Aiguo Wu and Min Zhang. 2023. A survey of large language models attribution."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","unstructured":"Minghao Li Tengchao Lv Jingye Chen Lei Cui Yijuan Lu Dinei Florencio Cha Zhang Zhoujun Li and Furu Wei. 2023. TrOCR: Transformer-based optical character recognition with pre-trained models. In Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence and Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence and Thirteenth Symposium on Educational Advances in Artificial Intelligence. AAAI Press Washington DC USA Article 1469 9 pages. doi:10.1609\/aaai.v37i11.26538","DOI":"10.1609\/aaai.v37i11.26538"},{"key":"e_1_3_2_1_73_1","volume-title":"2024 IEEE Conference on Secure and Trustworthy Machine Learning. IEEE, IEEE","author":"Li Zhangheng","year":"2024","unstructured":"Zhangheng Li, Junyuan Hong, Bo Li, and Zhangyang Wang. 2024. Shake to leak: Fine-tuning diffusion models can amplify the generative privacy risk. In 2024 IEEE Conference on Secure and Trustworthy Machine Learning. IEEE, IEEE, Toronto, Canada, 18\u201332."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-024-00878-8"},{"key":"e_1_3_2_1_75_1","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems. Curran Associates Inc.","author":"Longpre Shayne","year":"2024","unstructured":"Shayne Longpre, Robert Mahari, Ariel Lee, Campbell Lund, Hamidah Oderinwale, William Brannon, Nayan Saxena, Naana Obeng-Marnu, Tobin South, Cole Hunter, Kevin Klyman, Christopher Klamm, Hailey Schoelkopf, Nikhil Singh, Manuel Cherep, Ahmad Mustafa Anis, An Dinh, Caroline Chitongo, Da Yin, Damien Sileo, Deividas Mataciunas, Diganta Misra, Emad Alghamdi, Enrico Shippole, Jianguo Zhang, Joanna Materzynska, Kun Qian, Kush Tiwary, Lester Miranda, Manan Dey, Minnie Liang, Mohammed Hamdy, Niklas Muennighoff, Seonghyeon Ye, Seungone Kim, Shrestha Mohanty, Vipul Gupta, Vivek Sharma, Vu Minh Chien, Xuhui Zhou, Yizhi Li, Caiming Xiong, Luis Villa, Stella Biderman, Hanlin Li, Daphne Ippolito, Sara Hooker, Jad Kabbara, Sandy Pentland, and Data Provenance Initiative. 2024. Consent in crisis: The rapid decline of the AI data commons. In Proceedings of the 38th International Conference on Neural Information Processing Systems. Curran Associates Inc., Vancouver, BC, Canada, Article 3431, 46 pages."},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning. JMLR","author":"Longpre Shayne","year":"2024","unstructured":"Shayne Longpre, Robert Mahari, Naana Obeng-Marnu, William Brannon, Tobin South, Katy Gero, Sandy Pentland, and Jad Kabbara. 2024. Position: data authenticity, consent, & provenance for AI are all broken: what will it take to fix: them?. In Proceedings of the 41st International Conference on Machine Learning. JMLR, Vienna, Austria, Article 1328, 15 pages."},{"key":"e_1_3_2_1_77_1","unstructured":"Richard McPherson Reza Shokri and Vitaly Shmatikov. 2016. Defeating image obfuscation with deep learning."},{"key":"e_1_3_2_1_78_1","volume-title":"Rizwan Ahmed Khan, and Mueen Uddin","author":"Memon Jamshed","year":"2020","unstructured":"Jamshed Memon, Maira Sami, Rizwan Ahmed Khan, and Mueen Uddin. 2020. Handwritten optical character recognition (OCR): A comprehensive systematic literature review (SLR). IEEE access 8 (2020), 142642\u2013142668."},{"key":"e_1_3_2_1_79_1","volume-title":"2021 34th SIBGRAPI Conference on Graphics, Patterns and Images (SIBGRAPI). IEEE, IEEE, Online, 247\u2013254","author":"Menezes Hanna F","year":"2021","unstructured":"Hanna F Menezes, Arthur SC Ferreira, Eanes T Pereira, and Herman M Gomes. 2021. Bias and fairness in face detection. In 2021 34th SIBGRAPI Conference on Graphics, Patterns and Images (SIBGRAPI). IEEE, IEEE, Online, 247\u2013254."},{"key":"e_1_3_2_1_80_1","unstructured":"Microsoft. 2025. Presidio. https:\/\/microsoft.github.io\/presidio\/"},{"key":"e_1_3_2_1_81_1","unstructured":"Midjourney. 2025. https:\/\/www.midjourney.com\/home"},{"key":"e_1_3_2_1_82_1","unstructured":"Niloofar Mireshghallah Maria Antoniak Yash More Yejin Choi and Golnoosh Farnadi. 2024. Trust no bot: Discovering personal disclosures in human-llm conversations in the wild."},{"key":"e_1_3_2_1_83_1","unstructured":"Mozilla. 2025. X-Robots-Tag. https:\/\/developer.mozilla.org\/en-US\/docs\/Web\/HTTP\/Reference\/Headers\/X-Robots-Tag"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1075\/li.30.1.03nad","article-title":"A survey of named entity recognition and classication","volume":"30","author":"Nadeau David","year":"2007","unstructured":"David Nadeau and Satoshi Sekine. 2007. A survey of named entity recognition and classication. Lingvisticae Investigationes 30, 1 (2007), 3\u201326.","journal-title":"Lingvisticae Investigationes"},{"key":"e_1_3_2_1_85_1","unstructured":"Milad Nasr Nicholas Carlini Jonathan Hayase Matthew Jagielski A Feder Cooper Daphne Ippolito Christopher A Choquette-Choo Eric Wallace Florian Tram\u00e8r and Katherine Lee. 2023. Scalable extraction of training data from (production) language models."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3322905.3322917"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/3476887.3476888"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/3749987"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1162\/DAED_a_00113"},{"key":"e_1_3_2_1_90_1","volume-title":"The ethics of information technologies","author":"Nissenbaum Helen","unstructured":"Helen Nissenbaum. 2020. Protecting privacy in an information age: The problem of privacy in public. In The ethics of information technologies. Routledge, London, UK, 141\u2013178."},{"key":"e_1_3_2_1_91_1","unstructured":"Future of Privacy Forum. 2024. An Omnibus Definition of \u201cSensitive Data\u201d Across Comprehensive State Privacy Laws. https:\/\/cdn.sanity.io\/files\/3tzzh18d\/production\/eac1440d340a728f1f2c00ab6c27aff446bce67d.pdf"},{"key":"e_1_3_2_1_92_1","volume-title":"Proceedings of the 14th European Conference on Computer Vision. Springer, Springer Nature","author":"Oh Seong Joon","year":"2016","unstructured":"Seong Joon Oh, Rodrigo Benenson, Mario Fritz, and Bernt Schiele. 2016. Faceless person recognition: Privacy implications in social media. In Proceedings of the 14th European Conference on Computer Vision. Springer, Springer Nature, Amsterdam, The Netherlands, 19\u201335."},{"key":"e_1_3_2_1_93_1","unstructured":"OpenAI. 2025. ChatGPT. https:\/\/openai.com\/chatgpt\/overview\/"},{"key":"e_1_3_2_1_94_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE","author":"Orekondy Tribhuvanesh","year":"2018","unstructured":"Tribhuvanesh Orekondy, Mario Fritz, and Bernt Schiele. 2018. Connecting pixels to privacy and utility: Automatic redaction of private information in images. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE, Salt Lake City, Utah, USA, 8466\u20138475."},{"key":"e_1_3_2_1_95_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision. IEEE","author":"Orekondy Tribhuvanesh","year":"2017","unstructured":"Tribhuvanesh Orekondy, Bernt Schiele, and Mario Fritz. 2017. Towards a visual privacy advisor: Understanding and predicting privacy risks in images. In Proceedings of the IEEE International Conference on Computer Vision. IEEE, Venice, Italy, 3686\u20133695."},{"key":"e_1_3_2_1_96_1","unstructured":"PaddleOCR. 2025. PaddleOCR. https:\/\/paddlepaddle.github.io\/PaddleOCR\/latest\/en\/index.html"},{"key":"e_1_3_2_1_97_1","volume-title":"Emily M Bender, Emily Denton, and Alex Hanna.","author":"Paullada Amandalynne","year":"2021","unstructured":"Amandalynne Paullada, Inioluwa Deborah Raji, Emily M Bender, Emily Denton, and Alex Hanna. 2021. Data and its (dis) contents: A survey of dataset development and use in machine learning research. Patterns 2, 11 (2021), 17 pages."},{"key":"e_1_3_2_1_98_1","unstructured":"Charles Preston. 2020. List of religious populations. britannica.com\/topic\/List-of-religious-populations"},{"key":"e_1_3_2_1_99_1","unstructured":"PRIVO. 2025. COPPA Safe Harbor Program. https:\/\/www.privo.com\/coppa-safe-harbor-program"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"crossref","first-page":"1583","DOI":"10.1017\/glj.2021.79","article-title":"The difficulty of defining sensitive data\u2014the concept of sensitive data in the EU data protection framework","volume":"22","author":"Quinn Paul","year":"2021","unstructured":"Paul Quinn and Gianclaudio Malgieri. 2021. The difficulty of defining sensitive data\u2014the concept of sensitive data in the EU data protection framework. German Law Journal 22, 8 (2021), 1583\u20131612.","journal-title":"German Law Journal"},{"key":"e_1_3_2_1_101_1","volume-title":"International Conference on Machine Learning. PMLR, Online, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, Online, 8748\u20138763."},{"key":"e_1_3_2_1_102_1","first-page":"1461","article-title":"The pathologies of digital consent","volume":"96","author":"Richards Neil","year":"2018","unstructured":"Neil Richards and Woodrow Hartzog. 2018. The pathologies of digital consent. Wash. UL Rev. 96 (2018), 1461.","journal-title":"Wash. UL Rev."},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_104_1","volume-title":"Proceedings of the 22nd ACM Internet Measurement Conference. ACM","author":"Ruth Kimberly","year":"2022","unstructured":"Kimberly Ruth, Deepak Kumar, Brandon Wang, Luke Valenta, and Zakir Durumeric. 2022. Toppling top lists: Evaluating the accuracy of popular website lists. In Proceedings of the 22nd ACM Internet Measurement Conference. ACM, Nice, France, 374\u2013387."},{"key":"e_1_3_2_1_105_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems 35 (2022), 36479\u201336494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"publisher","DOI":"10.1145\/3476058"},{"key":"e_1_3_2_1_107_1","first-page":"25278","article-title":"LAION-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. LAION-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems 35 (2022), 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.1177\/2378023120967171"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533148"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.4236\/jsea.2024.171001","article-title":"Whispered tuning: Data privacy preservation in fine-tuning LLMs through differential privacy","volume":"17","author":"Singh Tanmay","year":"2024","unstructured":"Tanmay Singh, Harshvardhan Aditya, Vijay K Madisetti, and Arshdeep Bahga. 2024. Whispered tuning: Data privacy preservation in fine-tuning LLMs through differential privacy. Journal of Software Engineering and Applications 17, 1 (2024), 1\u201322.","journal-title":"Journal of Software Engineering and Applications"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2007.4376991"},{"key":"e_1_3_2_1_112_1","first-page":"477","article-title":"A taxonomy of privacy. U. Pa. l","volume":"154","author":"Solove Daniel J","year":"2005","unstructured":"Daniel J Solove. 2005. A taxonomy of privacy. U. Pa. l. Rev. 154 (2005), 477.","journal-title":"Rev."},{"key":"e_1_3_2_1_113_1","first-page":"1880","article-title":"Privacy self-management and the consent dilemma","volume":"126","author":"Solove Daniel J","year":"2013","unstructured":"Daniel J Solove. 2013. Privacy self-management and the consent dilemma. Harvard Law Review 126 (2013), 1880.","journal-title":"Harvard Law Review"},{"key":"e_1_3_2_1_114_1","first-page":"593","article-title":"Murky consent: an approach to the fictions of consent in privacy law","volume":"104","author":"Solove Daniel J","year":"2024","unstructured":"Daniel J Solove. 2024. Murky consent: an approach to the fictions of consent in privacy law. BUL Rev. 104 (2024), 593.","journal-title":"BUL Rev."},{"key":"e_1_3_2_1_115_1","first-page":"1","article-title":"Artificial Intelligence and Privacy. Fla","volume":"77","author":"Solove Daniel J.","year":"2025","unstructured":"Daniel J. Solove. 2025. Artificial Intelligence and Privacy. Fla. L. Rev. 77, 1 (Jan. 2025), 1\u201373.","journal-title":"L. Rev."},{"key":"e_1_3_2_1_116_1","first-page":"1021","article-title":"Kafka in the Age of AI and the Futility of Privacy as Control","volume":"104","author":"Solove Daniel J","year":"2024","unstructured":"Daniel J Solove and Woodrow Hartzog. 2024. Kafka in the Age of AI and the Futility of Privacy as Control. BUL Rev. 104 (2024), 1021.","journal-title":"BUL Rev."},{"key":"e_1_3_2_1_117_1","first-page":"1521","article-title":"The Great Scrape: The clash between scraping and privacy","volume":"113","author":"Solove Daniel J","year":"2025","unstructured":"Daniel J Solove and Woodrow Hartzog. 2025. The Great Scrape: The clash between scraping and privacy. Cal. L. Rev. 113 (2025), 1521.","journal-title":"Cal. L. Rev."},{"key":"e_1_3_2_1_118_1","doi-asserted-by":"publisher","DOI":"10.1145\/1242572.1242726"},{"key":"e_1_3_2_1_119_1","first-page":"239","article-title":"Big data for all: Privacy and user control in the age of analytics","volume":"11","author":"Tene Omer","year":"2012","unstructured":"Omer Tene and Jules Polonetsky. 2012. Big data for all: Privacy and user control in the age of analytics. Nw. J. Tech. & Intell. Prop. 11 (2012), 239.","journal-title":"Nw. J. Tech. & Intell. Prop."},{"key":"e_1_3_2_1_120_1","first-page":"3","article-title":"Identifying and eliminating csam in generative ml training data and models","volume":"23","author":"Thiel David","year":"2023","unstructured":"David Thiel. 2023. Identifying and eliminating csam in generative ml training data and models. Stanford Internet Observatory, Cyber Policy Center, December 23 (2023), 3.","journal-title":"Stanford Internet Observatory, Cyber Policy Center"},{"key":"e_1_3_2_1_121_1","first-page":"617","article-title":"Meaningful choice: A history of consent and alternatives to the consent myth","volume":"22","author":"Tschider Charlotte A","year":"2020","unstructured":"Charlotte A Tschider. 2020. Meaningful choice: A history of consent and alternatives to the consent myth. NCJL & Tech. 22 (2020), 617.","journal-title":"NCJL & Tech."},{"key":"e_1_3_2_1_122_1","volume-title":"2022 6th International Conference on Electronics, Communication and Aerospace Technology. IEEE, IEEE","author":"Vedhaviyassh DR","year":"2022","unstructured":"DR Vedhaviyassh, R Sudhan, G Saranya, Mozhgan Safa, and D Arun. 2022. Comparative analysis of EasyOCR and TesseractOCR for automatic license plate recognition using deep learning algorithm. In 2022 6th International Conference on Electronics, Communication and Aerospace Technology. IEEE, IEEE, Coimbatore, Tamil Nadu, India, 966\u2013971."},{"key":"e_1_3_2_1_123_1","unstructured":"W3C. 2011. Write Web Crawler. https:\/\/www.w3.org\/wiki\/Write_Web_Crawler"},{"key":"e_1_3_2_1_124_1","first-page":"559","article-title":"Privacy as trust: Sharing personal information in a networked world","volume":"69","author":"Waldman Ari Ezra","year":"2014","unstructured":"Ari Ezra Waldman. 2014. Privacy as trust: Sharing personal information in a networked world. U. Miami L. Rev. 69 (2014), 559.","journal-title":"U. Miami L. Rev."},{"key":"e_1_3_2_1_125_1","unstructured":"Benjamin Wilson Judy Hoffman and Jamie Morgenstern. 2019. Predictive inequity in object detection."},{"key":"e_1_3_2_1_126_1","volume-title":"Michael Duan, Hyunwoo Kim, Yejin Choi, Yulia Tsvetkov, Sewoong Oh, and Pang Wei Koh.","author":"Xin Rui","year":"2025","unstructured":"Rui Xin, Niloofar Mireshghallah, Shuyue Stella Li, Michael Duan, Hyunwoo Kim, Yejin Choi, Yulia Tsvetkov, Sewoong Oh, and Pang Wei Koh. 2025. A false sense of privacy: Evaluating textual data sanitization beyond surface-level privacy leakage."},{"key":"e_1_3_2_1_127_1","volume-title":"International Conference on Machine Learning. PMLR, PMLR","author":"Yang Kaiyu","year":"2022","unstructured":"Kaiyu Yang, Jacqueline H Yau, Li Fei-Fei, Jia Deng, and Olga Russakovsky. 2022. A study of face obfuscation in ImageNet. In International Conference on Machine Learning. PMLR, PMLR, Baltimore, Maryland, USA, 25313\u201325330."},{"key":"e_1_3_2_1_128_1","volume-title":"a manually verified dataset of globally famous biographies. Scientific data 3, 1","author":"Yu Amy Zhao","year":"2016","unstructured":"Amy Zhao Yu, Shahar Ronen, Kevin Hu, Tiffany Lu, and C\u00e9sar A Hidalgo. 2016. Pantheon 1.0, a manually verified dataset of globally famous biographies. Scientific data 3, 1 (2016), 1\u201316."},{"key":"e_1_3_2_1_129_1","doi-asserted-by":"publisher","unstructured":"Jing Zhao Heliang Zheng Chaoyue Wang Long Lan Wanrong Huang and Yuhua Tang. 2025. MagicNaming: consistent identity generation by finding a \u201cname space\u201d in T2I diffusion models. In Proceedings of the Thirty-Ninth AAAI Conference on Artificial Intelligence and Thirty-Seventh Conference on Innovative Applications of Artificial Intelligence and Fifteenth Symposium on Educational Advances in Artificial Intelligence. AAAI Press Philadelphia Pennsylvania USA Article 1160 9 pages. doi:10.1609\/aaai.v39i10.33133","DOI":"10.1609\/aaai.v39i10.33133"},{"key":"e_1_3_2_1_130_1","volume-title":"Principal visual word discovery for automatic license plate detection","author":"Zhou Wengang","year":"2012","unstructured":"Wengang Zhou, Houqiang Li, Yijuan Lu, and Qi Tian. 2012. Principal visual word discovery for automatic license plate detection. IEEE transactions on image processing 21, 9 (2012), 4269\u20134279."},{"key":"e_1_3_2_1_131_1","volume-title":"Addressing conceptual gaps in big data research ethics: An application of contextual integrity. Social Media+ Society 4, 2","author":"Zimmer Michael","year":"2018","unstructured":"Michael Zimmer. 2018. Addressing conceptual gaps in big data research ethics: An application of contextual integrity. Social Media+ Society 4, 2 (2018), 2056305118768300."}],"event":{"name":"CSLAW '26: Symposium on Computer Science and Law","location":"Berkeley CA USA","acronym":"CSLAW '26","sponsor":["ACM Association for Computing Machinery"]},"container-title":["Proceedings of the Symposium on Computer Science and Law"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3788646.3789521","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T19:49:03Z","timestamp":1779479343000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3788646.3789521"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,3]]},"references-count":131,"alternative-id":["10.1145\/3788646.3789521","10.1145\/3788646"],"URL":"https:\/\/doi.org\/10.1145\/3788646.3789521","relation":{},"subject":[],"published":{"date-parts":[[2026,3,3]]},"assertion":[{"value":"2026-05-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}