{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T04:39:18Z","timestamp":1769747958140,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1145\/3650105.3652298","type":"proceedings-article","created":{"date-parts":[[2024,6,12]],"date-time":"2024-06-12T16:01:35Z","timestamp":1718208095000},"page":"74-85","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["An Exploratory Investigation into Code License Infringements in Large Language Model Training Datasets"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-9574-2414","authenticated-orcid":false,"given":"Jonathan","family":"Katzy","sequence":"first","affiliation":[{"name":"Delft University of Technology, Delft, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6251-770X","authenticated-orcid":false,"given":"Razvan","family":"Popescu","sequence":"additional","affiliation":[{"name":"Delft University of Technology, Delft, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4850-3312","authenticated-orcid":false,"given":"Arie","family":"Van Deursen","sequence":"additional","affiliation":[{"name":"Delft University of Technology, Delft, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5093-5523","authenticated-orcid":false,"given":"Maliheh","family":"Izadi","sequence":"additional","affiliation":[{"name":"Delft University of Technology, Delft, Netherlands"}]}],"member":"320","published-online":{"date-parts":[[2024,6,12]]},"reference":[{"issue":"1","key":"e_1_3_2_1_1_1","first-page":"23","article-title":"Getty Images (US), Inc. v. Stability AI","year":"2023","unstructured":"2023. Getty Images (US), Inc. v. Stability AI, Inc. United States District Court for the District of Delaware. Case No. 1:23-cv-00135-UNA.","journal-title":"Inc. United States District Court for the District of Delaware. Case"},{"issue":"1","key":"e_1_3_2_1_2_1","first-page":"23","article-title":"Mike Huckabee, Relevate Group, David Kinnaman, Tsh Oxenreider, Lysa TerKeurst, and John Blase, Plaintiffs, v. Meta Platforms, Inc., Bloomberg L.P., Bloomberg Finance, L.P., Microsoft Corporation, and The Eleutherai Institute","year":"2023","unstructured":"2023. Mike Huckabee, Relevate Group, David Kinnaman, Tsh Oxenreider, Lysa TerKeurst, and John Blase, Plaintiffs, v. Meta Platforms, Inc., Bloomberg L.P., Bloomberg Finance, L.P., Microsoft Corporation, and The Eleutherai Institute, Defendants. United States District Court Southern District of New York. Case No. 1:23-cv-09152-LGS.","journal-title":"Defendants. United States District Court Southern District of New York. Case"},{"issue":"1","key":"e_1_3_2_1_3_1","first-page":"23","article-title":"The New York Times Company v. Microsoft Corporation, OpenAI, Inc., OpenAI LP, OpenAI GP, LLC, OpenAI LLC, OpenAI OpCo LLC, OpenAI Global LLC, OAI Corporation, LLC, and OpenAI Holdings","year":"2023","unstructured":"2023. The New York Times Company v. Microsoft Corporation, OpenAI, Inc., OpenAI LP, OpenAI GP, LLC, OpenAI LLC, OpenAI OpCo LLC, OpenAI Global LLC, OAI Corporation, LLC, and OpenAI Holdings, LLC. United States District Court Southern District of New York. Case No. 1:23-cv-11195.","journal-title":"LLC. United States District Court Southern District of New York. Case"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/NLBSE59153.2023.00008"},{"key":"e_1_3_2_1_5_1","volume-title":"Targeted Attack on GPT-Neo for the SATML Language Model Data Extraction Challenge. arXiv preprint arXiv:2302.07735","author":"Al-Kaswan Ali","year":"2023","unstructured":"Ali Al-Kaswan, Maliheh Izadi, and Arie van Deursen. 2023. Targeted Attack on GPT-Neo for the SATML Language Model Data Extraction Challenge. arXiv preprint arXiv:2302.07735 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639133"},{"key":"e_1_3_2_1_7_1","volume-title":"Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero","author":"Allal Loubna Ben","year":"2023","unstructured":"Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo Garc\u00eda del R\u00edo, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, and Leandro von Werra. 2023. Santa-Coder: don't reach for the stars! arXiv:2301.03988 [cs.SE]"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511265.3550449"},{"key":"e_1_3_2_1_9_1","volume-title":"Quantifying Memorization Across Neural Language Models. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=TatRHT_1cK","author":"Carlini Nicholas","year":"2023","unstructured":"Nicholas Carlini, Daphne Ippolito, Matthew Jagielski, Katherine Lee, Florian Tramer, and Chiyuan Zhang. 2023. Quantifying Memorization Across Neural Language Models. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=TatRHT_1cK"},{"key":"e_1_3_2_1_10_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_1_11_1","unstructured":"Together Computer. 2023. RedPajama: An Open Source Recipe to Reproduce LLaMA training dataset. https:\/\/github.com\/togethercomputer\/RedPajama-Data"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/1985441.1985468"},{"key":"e_1_3_2_1_13_1","volume-title":"Large language models for software engineering: Survey and open problems. arXiv preprint arXiv:2310.03533","author":"Fan Angela","year":"2023","unstructured":"Angela Fan, Beliz Gokkaya, Mark Harman, Mitya Lyubarskiy, Shubho Sengupta, Shin Yoo, and Jie M Zhang. 2023. Large language models for software engineering: Survey and open problems. arXiv preprint arXiv:2310.03533 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"A bibliometric review of large language models research from 2017 to","author":"Fan Lizhou","year":"2023","unstructured":"Lizhou Fan, Lingyao Li, Zihui Ma, Sanggyu Lee, Huizi Yu, and Libby Hemphill. 2023. A bibliometric review of large language models research from 2017 to 2023. arXiv preprint arXiv:2304.02020 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.139"},{"key":"e_1_3_2_1_16_1","unstructured":"The Apache Software Foundation. 2004. Apache License Version 2.0. https:\/\/www.apache.org\/licenses\/LICENSE-2.0"},{"key":"e_1_3_2_1_17_1","volume-title":"InCoder: A Generative Model for Code Infilling and Synthesis. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=hQwb-lbM6EL","author":"Fried Daniel","year":"2023","unstructured":"Daniel Fried, Armen Aghajanyan, Jessy Lin, Sida Wang, Eric Wallace, Freda Shi, Ruiqi Zhong, Scott Yih, Luke Zettlemoyer, and Mike Lewis. 2023. InCoder: A Generative Model for Code Infilling and Synthesis. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=hQwb-lbM6EL"},{"key":"e_1_3_2_1_18_1","volume-title":"The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv:2101.00027 [cs.CL]","author":"Gao Leo","year":"2020","unstructured":"Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. 2020. The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv:2101.00027 [cs.CL]"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2009.5070520"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1858996.1859088"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.499"},{"key":"e_1_3_2_1_22_1","volume-title":"GraphCodeBERT: Pre-training Code Representations with Data Flow. In International Conference on Learning Representations.","author":"Guo Daya","year":"2020","unstructured":"Daya Guo, Shuo Ren, Shuai Lu, Zhangyin Feng, Duyu Tang, LIU Shujie, Long Zhou, Nan Duan, Alexey Svyatkovskiy, Shengyu Fu, et al. 2020. GraphCodeBERT: Pre-training Code Representations with Data Flow. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","volume-title":"Foundation models and fair use. arXiv preprint arXiv:2303.15715","author":"Henderson Peter","year":"2023","unstructured":"Peter Henderson, Xuechen Li, Dan Jurafsky, Tatsunori Hashimoto, Mark A Lemley, and Percy Liang. 2023. Foundation models and fair use. arXiv preprint arXiv:2303.15715 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Large language models for software engineering: A systematic literature review. arXiv preprint arXiv:2308.10620","author":"Hou Xinyi","year":"2023","unstructured":"Xinyi Hou, Yanjie Zhao, Yue Liu, Zhou Yang, Kailong Wang, Li Li, Xiapu Luo, David Lo, John Grundy, and Haoyu Wang. 2023. Large language models for software engineering: A systematic literature review. arXiv preprint arXiv:2308.10620 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523273"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.inlg-main.3"},{"key":"e_1_3_2_1_27_1","volume-title":"Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, Leandro von Werra, and Harm de Vries.","author":"Kocetkov Denis","year":"2022","unstructured":"Denis Kocetkov, Raymond Li, Loubna Ben Allal, Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, Leandro von Werra, and Harm de Vries. 2022. The Stack: 3 TB of permissively licensed source code. arXiv:2211.15533 [cs.CL]"},{"key":"e_1_3_2_1_28_1","volume-title":"Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al.","author":"Li Raymond","year":"2023","unstructured":"Raymond Li, Loubna Ben Allal, Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al. 2023. StarCoder: may the source be with you! arXiv preprint arXiv:2305.06161 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Shengyu Fu, and Shujie LIU.","author":"Lu Shuai","year":"2021","unstructured":"Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Duyu Tang, Ge Li, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, MING GONG, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, and Shujie LIU. 2021. CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1). https:\/\/openreview.net\/forum?id=6lE4dQXaUcb"},{"key":"e_1_3_2_1_30_1","volume-title":"At Which Training Stage Does Code Data Help LLMs Reasoning? arXiv preprint arXiv:2309.16298","author":"Ma Yingwei","year":"2023","unstructured":"Yingwei Ma, Yue Liu, Yue Yu, Yuanliang Zhang, Yu Jiang, Changjian Wang, and Shanshan Li. 2023. At Which Training Stage Does Code Data Help LLMs Reasoning? arXiv preprint arXiv:2309.16298 (2023)."},{"key":"e_1_3_2_1_31_1","unstructured":"Rettigheds Alliancen. 2023. Rights Alliance Removes the Illegal Books3 Dataset Used to Train Artificial Intelligence. https:\/\/rettighedsalliancen.com\/rights-alliance-removes-the-illegal-books3-dataset-used-to-train-artificial-intelligence\/"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_33_1","unstructured":"GNU Operating System. 2022. What is copyleft? https:\/\/www.gnu.org\/licenses\/licenses.html#WhatIsCopyleft"},{"key":"e_1_3_2_1_34_1","volume-title":"Software testing with large language model: Survey, landscape, and vision. arXiv preprint arXiv:2307.07221","author":"Wang Junjie","year":"2023","unstructured":"Junjie Wang, Yuchao Huang, Chunyang Chen, Zhe Liu, Song Wang, and Qing Wang. 2023. Software testing with large language model: Survey, landscape, and vision. arXiv preprint arXiv:2307.07221 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.68"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.174"},{"key":"e_1_3_2_1_37_1","volume-title":"arXiv preprint arXiv:2310.01166","author":"Yang Zhou","year":"2023","unstructured":"Zhou Yang, Zhipeng Zhao, Chenyu Wang, Jieke Shi, Dongsum Kim, Donggyun Han, and David Lo. 2023. Gotcha! This Model Uses My Code! Evaluating Membership Leakage Risks in Code Models. arXiv preprint arXiv:2310.01166 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"What do code models memorize? an empirical study on large language models of code. arXiv preprint arXiv:2308.09932","author":"Yang Zhou","year":"2023","unstructured":"Zhou Yang, Zhipeng Zhao, Chenyu Wang, Jieke Shi, Dongsun Kim, DongGyun Han, and David Lo. 2023. What do code models memorize? an empirical study on large language models of code. arXiv preprint arXiv:2308.09932 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.411"},{"key":"e_1_3_2_1_40_1","volume-title":"Code Membership Inference for Detecting Unauthorized Data Use in Code Pre-trained Language Models. arXiv preprint arXiv:2312.07200","author":"Zhang Sheng","year":"2023","unstructured":"Sheng Zhang and Hui Li. 2023. Code Membership Inference for Detecting Unauthorized Data Use in Code Pre-trained Language Models. arXiv preprint arXiv:2312.07200 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Wayne Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et al. 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)."}],"event":{"name":"FORGE '24: 2024 IEEE\/ACM First International Conference on AI Foundation Models and Software Engineering","location":"Lisbon Portugal","acronym":"FORGE '24","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 IEEE\/ACM First International Conference on AI Foundation Models and Software Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650105.3652298","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650105.3652298","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:43Z","timestamp":1750291423000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650105.3652298"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":41,"alternative-id":["10.1145\/3650105.3652298","10.1145\/3650105"],"URL":"https:\/\/doi.org\/10.1145\/3650105.3652298","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]},"assertion":[{"value":"2024-06-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}