{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:00:46Z","timestamp":1750309246560,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,15]],"date-time":"2024-04-15T00:00:00Z","timestamp":1713139200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,15]]},"DOI":"10.1145\/3643991.3644864","type":"proceedings-article","created":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T13:05:13Z","timestamp":1719925513000},"page":"637-641","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CodeLL: A Lifelong Learning Dataset to Support the Co-Evolution of Data and Language Models of Code"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5987-850X","authenticated-orcid":false,"given":"Martin","family":"Weyssow","sequence":"first","affiliation":[{"name":"DIRO, University of Montreal, Montreal, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9872-9542","authenticated-orcid":false,"given":"Claudio","family":"Di Sipio","sequence":"additional","affiliation":[{"name":"University of l'Aquila, L'Aquila, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5077-6793","authenticated-orcid":false,"given":"Davide","family":"Di Ruscio","sequence":"additional","affiliation":[{"name":"University of L'Aquila, L'Aquila, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6304-9926","authenticated-orcid":false,"given":"Houari","family":"Sahraoui","sequence":"additional","affiliation":[{"name":"DIRO, University of Montreal, Montreal, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,7,2]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Hussein Alrubaye Deema AlShoaibi Mohamed Wiem Mkaouer and Ali Ouni. [n. d.]. How Does API Migration Impact Software Quality and Comprehension? An Empirical Study. ([n. d.]) 12."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER.2018.8330249"},{"key":"e_1_3_2_1_3_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"The codrep machine learning on source code competition. arXiv preprint arXiv:1807.03200","author":"Chen Zimin","year":"2018","unstructured":"Zimin Chen and Martin Monperrus. 2018. The codrep machine learning on source code competition. arXiv preprint arXiv:1807.03200 (2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"International Conference on Digital Preservation. https:\/\/api.semanticscholar.org\/CorpusID:53597098","author":"Cosmo Roberto Di","year":"2017","unstructured":"Roberto Di Cosmo and Stefano Zacchiroli. 2017. Software Heritage: Why and How to Preserve Software Source Code. In International Conference on Digital Preservation. https:\/\/api.semanticscholar.org\/CorpusID:53597098"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453478"},{"key":"e_1_3_2_1_7_1","unstructured":"Leo Gao Stella Biderman Sid Black Laurence Golding Travis Hoppe Charles Foster Jason Phang Horace He Anish Thite Noa Nabeshima et al. 2020. The pile: An 800gb dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00015"},{"key":"e_1_3_2_1_9_1","volume-title":"Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436","author":"Husain Hamel","year":"2019","unstructured":"Hamel Husain, Ho-Hsiang Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019. Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436 (2019)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSR59073.2023.00075"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379597.3387491"},{"key":"e_1_3_2_1_12_1","volume-title":"Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, Leandro von Werra, and Harm de Vries.","author":"Kocetkov Denis","year":"2022","unstructured":"Denis Kocetkov, Raymond Li, Loubna Ben Allal, Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, Leandro von Werra, and Harm de Vries. 2022. The Stack: 3 TB of permissively licensed source code. Preprint (2022)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10664-017-9521-5"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589806.3600043"},{"key":"e_1_3_2_1_15_1","volume-title":"CCT5: A Code-Change-Oriented Pre-Trained Model. arXiv preprint arXiv:2305.10785","author":"Lin Bo","year":"2023","unstructured":"Bo Lin, Shangwen Wang, Zhongxin Liu, Yepang Liu, Xin Xia, and Xiaoguang Mao. 2023. CCT5: A Code-Change-Oriented Pre-Trained Model. arXiv preprint arXiv:2305.10785 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Codexglue: A machine learning benchmark dataset for code understanding and generation. arXiv preprint arXiv:2102.04664","author":"Lu Shuai","year":"2021","unstructured":"Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Duyu Tang, et al. 2021. Codexglue: A machine learning benchmark dataset for code understanding and generation. arXiv preprint arXiv:2102.04664 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Megadiff: A dataset of 600k java source code changes categorized by diff size. arXiv preprint arXiv:2108.04631","author":"Monperrus Martin","year":"2021","unstructured":"Martin Monperrus, Matias Martinez, He Ye, Fernanda Madeiral, Thomas Durieux, and Zhongxing Yu. 2021. Megadiff: A dataset of 600k java source code changes categorized by diff size. arXiv preprint arXiv:2108.04631 (2021)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.117267"},{"key":"e_1_3_2_1_19_1","unstructured":"Erik Nijkamp Bo Pang Hiroaki Hayashi Lifu Tu Huan Wang Yingbo Zhou Silvio Savarese and Caiming Xiong. 2023. CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis. arXiv:2203.13474 [cs.LG]"},{"key":"e_1_3_2_1_20_1","volume-title":"Gorilla: Large language model connected with massive apis. arXiv preprint arXiv:2305.15334","author":"Patil Shishir G","year":"2023","unstructured":"Shishir G Patil, Tianjun Zhang, Xin Wang, and Joseph E Gonzalez. 2023. Gorilla: Large language model connected with massive apis. arXiv preprint arXiv:2305.15334 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Codenet: A large-scale ai for code dataset for learning a diversity of coding tasks. arXiv preprint arXiv:2105.12655","author":"Puri Ruchir","year":"2021","unstructured":"Ruchir Puri, David S Kung, Geert Janssen, Wei Zhang, Giacomo Domeniconi, Vladimir Zolotov, Julian Dolby, Jie Chen, Mihir Choudhury, Lindsey Decker, et al. 2021. Codenet: A large-scale ai for code dataset for learning a diversity of coding tasks. arXiv preprint arXiv:2105.12655 (2021)."},{"key":"e_1_3_2_1_22_1","volume-title":"Toolllm: Facilitating large language models to master 16000+ real-world apis. arXiv preprint arXiv:2307.16789","author":"Qin Yujia","year":"2023","unstructured":"Yujia Qin, Shihao Liang, Yining Ye, Kunlun Zhu, Lan Yan, Yaxi Lu, Yankai Lin, Xin Cong, Xiangru Tang, Bill Qian, et al. 2023. Toolllm: Facilitating large language models to master 16000+ real-world apis. arXiv preprint arXiv:2307.16789 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al.","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al. 2023. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models. arXiv preprint arXiv:2308.10462","author":"Weyssow Martin","year":"2023","unstructured":"Martin Weyssow, Xin Zhou, Kisub Kim, David Lo, and Houari Sahraoui. 2023. Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models. arXiv preprint arXiv:2308.10462 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"Martin Weyssow Xin Zhou Kisub Kim David Lo and Houari Sahraoui. 2023. On the Usage of Continual Learning for Out-of-Distribution Generalization in Pre-trained Language Models of Code. arXiv:2305.04106 [cs]. 10.48550\/arXiv.2305.04106","DOI":"10.48550\/arXiv.2305.04106"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPC.2019.00052"},{"key":"e_1_3_2_1_27_1","volume-title":"The Devil is in the Tails: How Long-Tailed Code Distributions Impact Large Language Models. arXiv preprint arXiv:2309.03567","author":"Zhou Xin","year":"2023","unstructured":"Xin Zhou, Kisub Kim, Bowen Xu, Jiakun Liu, DongGyun Han, and David Lo. 2023. The Devil is in the Tails: How Long-Tailed Code Distributions Impact Large Language Models. arXiv preprint arXiv:2309.03567 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Xlcost: A benchmark dataset for cross-lingual code intelligence. arXiv preprint arXiv:2206.08474","author":"Zhu Ming","year":"2022","unstructured":"Ming Zhu, Aneesh Jain, Karthik Suresh, Roshan Ravindran, Sindhu Tipirneni, and Chandan K Reddy. 2022. Xlcost: A benchmark dataset for cross-lingual code intelligence. arXiv preprint arXiv:2206.08474 (2022)."}],"event":{"name":"MSR '24: 21st International Conference on Mining Software Repositories","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","IEEE CS"],"location":"Lisbon Portugal","acronym":"MSR '24"},"container-title":["Proceedings of the 21st International Conference on Mining Software Repositories"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3643991.3644864","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3643991.3644864","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:56:44Z","timestamp":1750291004000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3643991.3644864"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,15]]},"references-count":28,"alternative-id":["10.1145\/3643991.3644864","10.1145\/3643991"],"URL":"https:\/\/doi.org\/10.1145\/3643991.3644864","relation":{},"subject":[],"published":{"date-parts":[[2024,4,15]]},"assertion":[{"value":"2024-07-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}