{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T17:02:15Z","timestamp":1776099735737,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":85,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100005230","name":"Natural Science Foundation of Chongqing","doi-asserted-by":"publisher","award":["cstc2021jcyj-msxmX1115"],"award-info":[{"award-number":["cstc2021jcyj-msxmX1115"]}],"id":[{"id":"10.13039\/501100005230","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ARC Laureate Fellowship","award":["FL190100035"],"award-info":[{"award-number":["FL190100035"]}]},{"name":"Ningbo Natural Science Foundation","award":["2023J292"],"award-info":[{"award-number":["2023J292"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3691620.3695061","type":"proceedings-article","created":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T15:39:19Z","timestamp":1729265959000},"page":"656-668","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["What Makes a High-Quality Training Dataset for Large Language Models: A Practitioners' Perspective"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4473-3068","authenticated-orcid":false,"given":"Xiao","family":"Yu","sequence":"first","affiliation":[{"name":"Huawei, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7235-1170","authenticated-orcid":false,"given":"Zexian","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Artificial Intelligence, Wuhan University of Technology, Wuhan, China"},{"name":"Wuhan University of Technology Chongqing Research Institute, Chongqing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4123-4554","authenticated-orcid":false,"given":"Feifei","family":"Niu","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering and Computer Science, University of Ottawa, Ottawa, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0093-3292","authenticated-orcid":false,"given":"Xing","family":"Hu","sequence":"additional","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6302-3256","authenticated-orcid":false,"given":"Xin","family":"Xia","sequence":"additional","affiliation":[{"name":"Huawei, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4928-7076","authenticated-orcid":false,"given":"John","family":"Grundy","sequence":"additional","affiliation":[{"name":"Faculty of Information Technology, Monash University, Victoria, Australia"}]}],"member":"320","published-online":{"date-parts":[[2024,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Apache Griffin. https:\/\/griffin.apache.org\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. Deequ. https:\/\/github.com\/awslabs\/deequ.git."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. Great Expectations. https:\/\/github.com\/great-expectations\/great_expectations."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Nvivo qualitative software. 
https:\/\/lumivero.com\/products\/nvivo\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Qualitis. https:\/\/github.com\/WeBankFinTech\/Qualitis."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","unstructured":"2024. Supplemental Materials. 10.6084\/m9.figshare.25928863","DOI":"10.6084\/m9.figshare.25928863"},{"key":"e_1_3_2_1_7_1","unstructured":"2024. wenjuanxing software. https:\/\/www.wjx.cn."},{"key":"e_1_3_2_1_8_1","volume-title":"Semdedup: Data-efficient learning at web-scale through semantic deduplication. arXiv preprint arXiv:2303.09540","author":"Abbas Amro","year":"2023","unstructured":"Amro Abbas, Kushal Tirumala, D\u00e1niel Simig, Surya Ganguli, and Ari S Morcos. 2023. Semdedup: Data-efficient learning at web-scale through semantic deduplication. arXiv preprint arXiv:2303.09540 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Shayne Longpre, Nathan Lambert, Xinyi Wang, Niklas Muennighoff, Bairu Hou, Liangming Pan, Haewon Jeong, Colin Raffel, Shiyu Chang, Tatsunori Hashimoto, and William Yang Wang.","author":"Albalak Alon","year":"2024","unstructured":"Alon Albalak, Yanai Elazar, Sang Michael Xie, Shayne Longpre, Nathan Lambert, Xinyi Wang, Niklas Muennighoff, Bairu Hou, Liangming Pan, Haewon Jeong, Colin Raffel, Shiyu Chang, Tatsunori Hashimoto, and William Yang Wang. 2024. A Survey on Data Selection for Language Models. arXiv:2402.16827 [cs.CL]"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3359591.3359735"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3484828"},{"key":"e_1_3_2_1_13_1","unstructured":"Rohan Anil Andrew M Dai Orhan Firat Melvin Johnson Dmitry Lepikhin Alexandre Passos Siamak Shakeri Emanuel Taropa Paige Bailey Zhifeng Chen et al. 2023. Palm 2 technical report. arXiv preprint arXiv:2305.10403 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1111\/opo.12131"},{"key":"e_1_3_2_1_15_1","volume-title":"Methodologies for data quality assessment and improvement. ACM computing surveys (CSUR) 41, 3","author":"Batini Carlo","year":"2009","unstructured":"Carlo Batini, Cinzia Cappiello, Chiara Francalanci, and Andrea Maurino. 2009. Methodologies for data quality assessment and improvement. ACM computing surveys (CSUR) 41, 3 (2009), 1--52."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-46002-9_23"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.14722\/madweb.2023.23043"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1002\/int.10074"},{"key":"e_1_3_2_1_19_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_20_1","volume-title":"The effects of data quality on machine learning performance. 
arXiv preprint arXiv:2207.14529","author":"Budach Lukas","year":"2022","unstructured":"Lukas Budach, Moritz Feuerpfeil, Nina Ihde, Andrea Nathansen, Nele Noack, Hendrik Patzlaff, Felix Naumann, and Hazar Harmouch. 2022. The effects of data quality on machine learning performance. arXiv preprint arXiv:2207.14529 (2022)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2961111.2962597"},{"key":"e_1_3_2_1_22_1","volume-title":"CodeT: Code Generation with Generated Tests. In The Eleventh International Conference on Learning Representations, ICLR","author":"Chen Bei","year":"2023","unstructured":"Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen. 2023. CodeT: Code Generation with Generated Tests. In The Eleventh International Conference on Learning Representations, ICLR 2023."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3317573"},{"key":"e_1_3_2_1_24_1","volume-title":"Are large-scale datasets necessary for self-supervised pre-training? arXiv preprint arXiv:2112.10740","author":"El-Nouby Alaaeldin","year":"2021","unstructured":"Alaaeldin El-Nouby, Gautier Izacard, Hugo Touvron, Ivan Laptev, Herv\u00e9 Jegou, and Edouard Grave. 2021. Are large-scale datasets necessary for self-supervised pre-training? arXiv preprint arXiv:2112.10740 (2021)."},{"key":"e_1_3_2_1_25_1","volume-title":"Large language models for software engineering: Survey and open problems. arXiv preprint arXiv:2310.03533","author":"Fan Angela","year":"2023","unstructured":"Angela Fan, Beliz Gokkaya, Mark Harman, Mitya Lyubarskiy, Shubho Sengupta, Shin Yoo, and Jie M Zhang. 2023. Large language models for software engineering: Survey and open problems. arXiv preprint arXiv:2310.03533 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.2307\/2340521"},{"key":"e_1_3_2_1_27_1","volume-title":"Incoder: A generative model for code infilling and synthesis. arXiv preprint arXiv:2204.05999","author":"Fried Daniel","year":"2022","unstructured":"Daniel Fried, Armen Aghajanyan, Jessy Lin, Sida Wang, Eric Wallace, Freda Shi, Ruiqi Zhong, Wen-tau Yih, Luke Zettlemoyer, and Mike Lewis. 2022. Incoder: A generative model for code infilling and synthesis. arXiv preprint arXiv:2204.05999 (2022)."},{"key":"e_1_3_2_1_28_1","volume-title":"Llm-based nlg evaluation: Current status and challenges. arXiv preprint arXiv:2402.01383","author":"Gao Mingqi","year":"2024","unstructured":"Mingqi Gao, Xinyu Hu, Jie Ruan, Xiao Pu, and Xiaojun Wan. 2024. Llm-based nlg evaluation: Current status and challenges. arXiv preprint arXiv:2402.01383 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-SEIP52600.2021.00039"},{"key":"e_1_3_2_1_30_1","unstructured":"Danny Hernandez Tom Brown Tom Conerly Nova DasSarma Dawn Drain Sheer El-Showk Nelson Elhage Zac Hatfield-Dodds Tom Henighan Tristan Hume et al. 2022. Scaling laws and interpretability of learning from repeated data. arXiv preprint arXiv:2205.10487 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Data quality and record linkage techniques","author":"Herzog Thomas N","unstructured":"Thomas N Herzog, Fritz J Scheuren, and William E Winkler. 2007. Data quality and record linkage techniques. Vol. 1. Springer."},{"key":"e_1_3_2_1_32_1","volume-title":"RAG and RAU: A Survey on Retrieval-Augmented Language Model in Natural Language Processing. arXiv preprint arXiv:2404.19543","author":"Hu Yucheng","year":"2024","unstructured":"Yucheng Hu and Yuxing Lu. 2024. 
RAG and RAU: A Survey on Retrieval-Augmented Language Model in Natural Language Processing. arXiv preprint arXiv:2404.19543 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2020.10.025"},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. PMLR, 10697--10707","author":"Kandpal Nikhil","year":"2022","unstructured":"Nikhil Kandpal, Eric Wallace, and Colin Raffel. 2022. Deduplicating training data mitigates privacy risks in language models. In International Conference on Machine Learning. PMLR, 10697--10707."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3635059.3635104"},{"key":"e_1_3_2_1_36_1","volume-title":"Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, et al.","author":"Kocetkov Denis","year":"2022","unstructured":"Denis Kocetkov, Raymond Li, Loubna Ben Allal, Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, et al. 2022. The stack: 3 tb of permissively licensed source code. arXiv preprint arXiv:2211.15533 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al.","author":"Scao Teven Le","year":"2023","unstructured":"Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al. 2023. Bloom: A 176b-parameter open-access multilingual language model. (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Deduplicating training data makes language models better. arXiv preprint arXiv:2107.06499","author":"Lee Katherine","year":"2021","unstructured":"Katherine Lee, Daphne Ippolito, Andrew Nystrom, Chiyuan Zhang, Douglas Eck, Chris Callison-Burch, and Nicholas Carlini. 2021. Deduplicating training data makes language models better. arXiv preprint arXiv:2107.06499 (2021)."},{"key":"e_1_3_2_1_39_1","volume-title":"Large language model ChatGPT versus small deep learning models for self-admitted technical debt detection: Why not together? Software: Practice and Experience","author":"Li Jun","year":"2024","unstructured":"Jun Li, Lixian Li, Jin Liu, Xiao Yu, Xiao Liu, and Jacky Wai Keung. 2024. Large language model ChatGPT versus small deep learning models for self-admitted technical debt detection: Why not together? Software: Practice and Experience (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"From quantity to quality: Boosting llm performance with self-guided data selection for instruction tuning. arXiv preprint arXiv:2308.12032","author":"Li Ming","year":"2023","unstructured":"Ming Li, Yong Zhang, Zhitao Li, Jiuhai Chen, Lichang Chen, Ning Cheng, Jianzong Wang, Tianyi Zhou, and Jing Xiao. 2023. From quantity to quality: Boosting llm performance with self-guided data selection for instruction tuning. arXiv preprint arXiv:2308.12032 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al.","author":"Li Raymond","year":"2023","unstructured":"Raymond Li, Loubna Ben Allal, Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al. 2023. Starcoder: may the source be with you! 
arXiv preprint arXiv:2305.06161 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Agustin Dal Lago, et al","author":"Li Yujia","year":"2022","unstructured":"Yujia Li, David Choi, Junyoung Chung, Nate Kushman, Julian Schrittwieser, R\u00e9mi Leblond, Tom Eccles, James Keeling, Felix Gimeno, Agustin Dal Lago, et al. 2022. Competition-level code generation with alphacode. Science 378, 6624 (2022), 1092--1097."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3608128"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3540250.3549082"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2786805.2786809"},{"key":"e_1_3_2_1_46_1","volume-title":"Oyvind Tafjord, Dustin Schwenk, Evan Pete Walsh, Yanai Elazar, Kyle Lo, et al.","author":"Magnusson Ian","year":"2023","unstructured":"Ian Magnusson, Akshita Bhagia, Valentin Hofmann, Luca Soldaini, Ananya Harsh Jha, Oyvind Tafjord, Dustin Schwenk, Evan Pete Walsh, Yanai Elazar, Kyle Lo, et al. 2023. Paloma: A Benchmark for Evaluating Language Model Fit. arXiv preprint arXiv:2312.10523 (2023)."},{"key":"e_1_3_2_1_47_1","unstructured":"maketing evolution. 2022. What is Data Quality? Definition & Dimensions. https:\/\/www.marketingevolution.com\/marketing-essentials\/data-quality."},{"key":"e_1_3_2_1_48_1","volume-title":"Codegen: An open large language model for code with multi-turn program synthesis. arXiv preprint arXiv:2203.13474","author":"Nijkamp Erik","year":"2022","unstructured":"Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong. 2022. Codegen: An open large language model for code with multi-turn program synthesis. arXiv preprint arXiv:2203.13474 (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"A conversational paradigm for program synthesis. arXiv preprint arXiv:2203.13474 30","author":"Nijkamp Erik","year":"2022","unstructured":"Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong. 2022. A conversational paradigm for program synthesis. arXiv preprint arXiv:2203.13474 30 (2022)."},{"key":"e_1_3_2_1_50_1","unstructured":"Nostalgebraist. 2022. chinchilla's wild implications. https:\/\/www.alignmentforum.org\/posts\/6Fpvch8RR29qLEWNH\/chinchilla-s-wild-implications."},{"key":"e_1_3_2_1_51_1","volume-title":"The refinedweb dataset for falcon llm: Outperforming curated corpora with web data only. Advances in Neural Information Processing Systems 36","author":"Penedo Guilherme","year":"2024","unstructured":"Guilherme Penedo, Quentin Malartic, Daniel Hesslow, Ruxandra Cojocaru, Hamza Alobeidli, Alessandro Cappelli, Baptiste Pannier, Ebtesam Almazrouei, and Julien Launay. 2024. The refinedweb dataset for falcon llm: Outperforming curated corpora with web data only. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_52_1","unstructured":"Qlik. 2024. Data Quality. https:\/\/www.qlik.com\/us\/data-governance\/data-quality."},{"key":"e_1_3_2_1_53_1","unstructured":"Jack W Rae Sebastian Borgeaud Trevor Cai Katie Millican Jordan Hoffmann Francis Song John Aslanides Sarah Henderson Roman Ring Susannah Young et al. 2021. Scaling language models: Methods analysis & insights from training gopher. arXiv preprint arXiv:2112.11446 (2021)."},{"key":"e_1_3_2_1_54_1","volume-title":"Data quality for the information age","author":"Redman Thomas C","unstructured":"Thomas C Redman. 1997. Data quality for the information age. 
Artech House, Inc."},{"key":"e_1_3_2_1_55_1","volume-title":"Data quality: the field guide","author":"Redman Thomas C","unstructured":"Thomas C Redman. 2001. Data quality: the field guide. Digital press."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00112"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE56229.2023.00143"},{"key":"e_1_3_2_1_58_1","volume-title":"Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al.","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al. 2023. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_59_1","first-page":"1","article-title":"Data quality under a computer science perspective","volume":"2","author":"Scannapieco Monica","year":"2002","unstructured":"Monica Scannapieco and Tiziana Catarci. 2002. Data quality under a computer science perspective. Archivi & Computer 2 (2002), 1--15.","journal-title":"Archivi & Computer"},{"key":"e_1_3_2_1_60_1","unstructured":"Robert Sheldon. 2024. What is data management and why is it important? https:\/\/www.techtarget.com\/searchdatamanagement\/definition\/data-quality."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/InfRKM.2012.6204995"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592534"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CHASE.2013.6614738"},{"key":"e_1_3_2_1_64_1","volume-title":"Dolma: An Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. arXiv preprint arXiv:2402.00159","author":"Soldaini Luca","year":"2024","unstructured":"Luca Soldaini, Rodney Kinney, Akshita Bhagia, Dustin Schwenk, David Atkinson, Russell Authur, Ben Bogin, Khyathi Chandu, Jennifer Dumas, Yanai Elazar, et al. 2024. Dolma: An Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. arXiv preprint arXiv:2402.00159 (2024)."},{"key":"e_1_3_2_1_65_1","unstructured":"Myles Suer. 2023. What Is Data Quality and Why Is It Important? https:\/\/www.alation.com\/blog\/what-is-data-quality-why-is-it-important\/."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME52107.2021.00029"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE56229.2023.00076"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2016.2584050"},{"key":"e_1_3_2_1_69_1","volume-title":"D4: Improving llm pretraining via document de-duplication and diversification. Advances in Neural Information Processing Systems 36","author":"Tirumala Kushal","year":"2024","unstructured":"Kushal Tirumala, Daniel Simig, Armen Aghajanyan, and Ari Morcos. 2024. D4: Improving llm pretraining via document de-duplication and diversification. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_70_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/240455.240479"},{"key":"e_1_3_2_1_72_1","volume-title":"Software testing with large language models: Survey, landscape, and vision","author":"Wang Junjie","year":"2024","unstructured":"Junjie Wang, Yuchao Huang, Chunyang Chen, Zhe Liu, Song Wang, and Qing Wang. 2024. Software testing with large language models: Survey, landscape, and vision. IEEE Transactions on Software Engineering (2024)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1080\/07421222.1996.11518099"},{"key":"e_1_3_2_1_74_1","volume-title":"Nghi DQ Bui, Junnan Li, and Steven CH Hoi.","author":"Wang Yue","year":"2023","unstructured":"Yue Wang, Hung Le, Akhilesh Deepak Gotmare, Nghi DQ Bui, Junnan Li, and Steven CH Hoi. 2023. Codet5+: Open code large language models for code understanding and generation. arXiv preprint arXiv:2305.07922 (2023)."},{"key":"e_1_3_2_1_75_1","volume-title":"Proceedings of the ACM on HumanComputer Interaction 7, CSCW1","author":"Gon\u00e7alves Pavl\u00edna Wurzel","year":"2023","unstructured":"Pavl\u00edna Wurzel Gon\u00e7alves, G\u00fcl Calikli, Alexander Serebrenik, and Alberto Bacchelli. 2023. Competencies for code review. Proceedings of the ACM on HumanComputer Interaction 7, CSCW1 (2023), 1--33."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2019.00098"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3520312.3534862"},{"key":"e_1_3_2_1_78_1","volume-title":"Harnessing the power of llms in practice: A survey on chatgpt and beyond. ACM Transactions on Knowledge Discovery from Data 18, 6","author":"Yang Jingfeng","year":"2024","unstructured":"Jingfeng Yang, Hongye Jin, Ruixiang Tang, Xiaotian Han, Qizhang Feng, Haoming Jiang, Shaochen Zhong, Bing Yin, and Xia Hu. 2024. Harnessing the power of llms in practice: A survey on chatgpt and beyond. ACM Transactions on Knowledge Discovery from Data 18, 6 (2024), 1--32."},{"key":"e_1_3_2_1_79_1","volume-title":"Jin Liu, and Xin Xia.","author":"Yu Xiao","year":"2024","unstructured":"Xiao Yu, Lei Liu, Xing Hu, Jacky Wai Keung, Jin Liu, and Xin Xia. 2024. Fight Fire with Fire: How Much Can We Trust ChatGPT on Source Code-Related Tasks? arXiv preprint arXiv:2405.12641 (2024)."},{"key":"e_1_3_2_1_80_1","volume-title":"Jin Liu, and Xin Xia.","author":"Yu Xiao","year":"2024","unstructured":"Xiao Yu, Lei Liu, Xing Hu, Jacky Wai Keung, Jin Liu, and Xin Xia. 2024. Where Are Large Language Models for Code Generation on GitHub? arXiv preprint arXiv:2406.19544 (2024)."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.411"},{"key":"e_1_3_2_1_82_1","volume-title":"Repocoder: Repository-level code completion through iterative retrieval and generation.","author":"Zhang Fengji","year":"2023","unstructured":"Fengji Zhang, Bei Chen, Yue Zhang, Jacky Keung, Jin Liu, Daoguang Zan, Yi Mao, Jian-Guang Lou, and Weizhu Chen. 2023. Repocoder: Repository-level code completion through iterative retrieval and generation. (2023), 2471--2484."},{"key":"e_1_3_2_1_83_1","volume-title":"Codegeex: A pre-trained model for code generation with multilingual evaluations on humaneval-x. arXiv preprint arXiv:2303.17568","author":"Zheng Qinkai","year":"2023","unstructured":"Qinkai Zheng, Xiao Xia, Xu Zou, Yuxiao Dong, Shan Wang, Yufei Xue, Zihan Wang, Lei Shen, Andi Wang, Yang Li, et al. 2023. 
Codegeex: A pre-trained model for code generation with multilingual evaluations on humaneval-x. arXiv preprint arXiv:2303.17568 (2023)."},{"key":"e_1_3_2_1_84_1","volume-title":"A survey of large language models for code: Evolution, benchmarking, and future trends. arXiv preprint arXiv:2311.10372","author":"Zheng Zibin","year":"2023","unstructured":"Zibin Zheng, Kaiwen Ning, Yanlin Wang, Jingwen Zhang, Dewu Zheng, Mingxi Ye, and Jiachi Chen. 2023. A survey of large language models for code: Evolution, benchmarking, and future trends. arXiv preprint arXiv:2311.10372 (2023)."},{"key":"e_1_3_2_1_85_1","volume-title":"Understanding self-supervised pretraining with part-aware representation learning. arXiv preprint arXiv:2301.11915","author":"Zhu Jie","year":"2023","unstructured":"Jie Zhu, Jiyang Qi, Mingyu Ding, Xiaokang Chen, Ping Luo, Xinggang Wang, Wenyu Liu, Leye Wang, and Jingdong Wang. 2023. Understanding self-supervised pretraining with part-aware representation learning. arXiv preprint arXiv:2301.11915 (2023)."}],"event":{"name":"ASE '24: 39th IEEE\/ACM International Conference on Automated Software Engineering","location":"Sacramento CA USA","acronym":"ASE '24","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGSOFT ACM Special Interest Group on Software Engineering","IEEE CS"]},"container-title":["Proceedings of the 39th IEEE\/ACM International Conference on Automated Software Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3691620.3695061","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3691620.3695061","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:04:07Z","timestamp":1750291447000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3691620.3695061"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":85,"alternative-id":["10.1145\/3691620.3695061","10.1145\/3691620"],"URL":"https:\/\/doi.org\/10.1145\/3691620.3695061","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2024-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
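
The record above is the JSON body that Crossref's public REST API returns for this paper's DOI (a "work"-type message). Below is a minimal sketch of fetching and reading such a record in Python, assuming the third-party requests package is installed; the script name and the mailto contact in the User-Agent header are placeholders, included because Crossref uses a contact address to route callers to its "polite" request pool.

import requests

# Crossref "works" endpoint; the record above is the body of exactly
# this kind of response (message-type == "work").
DOI = "10.1145/3691620.3695061"
URL = f"https://api.crossref.org/works/{DOI}"

# Placeholder identity; replace the mailto address with a real contact.
headers = {"User-Agent": "crossref-fetch-sketch/0.1 (mailto:you@example.org)"}

resp = requests.get(URL, headers=headers, timeout=30)
resp.raise_for_status()
msg = resp.json()["message"]

# Read back a few of the fields present in the record above.
print(msg["title"][0])
print(", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"]))
print(msg["DOI"], msg["page"], "cited-by:", msg["is-referenced-by-count"])

Running this against the live API should print the paper's title, its six authors, the DOI, the page range "656-668", and the citation count as of the time of the request (13 when this record was indexed).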