{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T03:03:33Z","timestamp":1773371013770,"version":"3.50.1"},"reference-count":107,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10826108","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"5402-5411","source":"Crossref","is-referenced-by-count":5,"title":["Code LLMs: A Taxonomy-based Survey"],"prefix":"10.1109","author":[{"given":"Nishat","family":"Raihan","sequence":"first","affiliation":[{"name":"George Mason University,Fairfax,VA,USA"}]},{"given":"Christian","family":"Newman","sequence":"additional","affiliation":[{"name":"Rochester Institute of Technology,Rochester,NY,USA"}]},{"given":"Marcos","family":"Zampieri","sequence":"additional","affiliation":[{"name":"George Mason University,Fairfax,VA,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.139"},{"key":"ref3","article-title":"Graphcodebert: Pre-training code representations with data flow","author":"Guo","year":"2020"},{"key":"ref4","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref5","article-title":"Electra: Pretraining text encoders as discriminators rather than generators","author":"Clark","year":"2020"},{"key":"ref6","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref7","article-title":"Language models are few-shot learners","volume-title":"Advances in neural information processing systems","author":"Brown"},{"key":"ref8","article-title":"The falcon series of open language models","author":"Almazrouei","year":"2023"},{"key":"ref9","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"ref11","article-title":"Glu variants improve transformer","author":"Shazeer","year":"2020"},{"key":"ref12","article-title":"Claude technical report","year":"2023"},{"key":"ref13","article-title":"Wikipedia corpus"},{"key":"ref14","article-title":"Common crawl corpus"},{"key":"ref15","article-title":"Large language models in computer science education: A systematic literature review","author":"Raihan","year":"2024"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/WCRE.2010.13"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2011.84"},{"key":"ref19","first-page":"29","article-title":"Using information retrieval based coupling measures for impact analysis","volume-title":"2007 IEEE International Conference on Software Maintenance","author":"Poshyvanyk"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/WCRE.2010.10"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICSM.2012.6405277"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICSM.2011.6080778"},{"key":"ref23","volume-title":"Refactoring: Improving the Design of Existing Code","author":"Fowler","year":"1999"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICSM.2004.1357820"},{"key":"ref25","article-title":"Gpt-4 technical report","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.685"},{"key":"ref27","article-title":"Code llama: Open foundation models for code","author":"Roziere","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1139"},{"key":"ref29","article-title":"A robustly optimized bert pretraining approach with post-training","volume-title":"China National Conference on Chinese Computational Linguistics","author":"Liu"},{"key":"ref30","article-title":"Learning and evaluating contextual embedding of source code","volume-title":"International Conference on machine learning","author":"Kanade"},{"key":"ref31","article-title":"Treebert: A tree-based pre-trained model for programming language","volume-title":"Uncertainty in Artificial Intelligence","author":"Jiang"},{"key":"ref32","article-title":"Syncobert: Syntax-guided multi-modal contrastive pre-training for code representation","author":"Wang","year":"2021"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.80"},{"key":"ref34","article-title":"Codesearchnet challenge: Evaluating the state of semantic code search","author":"Husain","year":"2019"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p16-1195"},{"key":"ref36","article-title":"Python 150k: A large-scale dataset for python programs"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.442"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME.2014.77"},{"key":"ref39","article-title":"Global relational models of source code","volume-title":"International Conference on learning representations","author":"Hellendoorn"},{"key":"ref40","article-title":"Scalpel: The python static analysis framework","author":"Li","year":"2022"},{"key":"ref41","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.4324\/9781003022022-6"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.728"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.68"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510096"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1126\/science.abq1158"},{"key":"ref47","article-title":"Unified pretraining for program understanding and generation","volume-title":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Ahmad"},{"key":"ref48","first-page":"95","article-title":"Gpt-neox-20b: An open-source autoregressive language model","volume-title":"Proceedings of BigScience Episode# 5\u2013Workshop on Challenges & Perspectives in Creating Large Language Models","author":"Black"},{"key":"ref49","article-title":"Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation","author":"Liu","year":"2023"},{"key":"ref50","article-title":"Program synthesis with large language models","author":"Austin","year":"2021"},{"key":"ref51","article-title":"Introducing the next generation of Claude","year":"2024"},{"key":"ref52","article-title":"Gpt-4 omni: A comprehensive multimodal model for language, vision, and beyond","year":"2024"},{"key":"ref53","article-title":"Mistral 7b: A new era in code generation and multilingual capabilities","year":"2023"},{"key":"ref54","article-title":"Mistral large 2: Advancing code generation and mathematical reasoning","year":"2024"},{"key":"ref55","article-title":"The llama 3 herd of models","year":"2024"},{"key":"ref56","article-title":"Wizardcoder: Empowering code large language models with evol-instruct","author":"Luo","year":"2023"},{"key":"ref57","article-title":"Nemotron-4 340b: Technical report on large-scale code models","year":"2024"},{"key":"ref58","article-title":"Granite code models: A family of open foundation models for code intelligence","year":"2024"},{"key":"ref59","article-title":"Magicoder: Source code is all you need","author":"Wei","year":"2023"},{"key":"ref60","article-title":"phind-codellama","year":"2023"},{"key":"ref61","article-title":"Gemini: a family of highly capable multimodal models","author":"Anil","year":"2023"},{"key":"ref62","article-title":"Textbooks are all you need","author":"Gunasekar","year":"2023"},{"key":"ref63","article-title":"Textbooks are all you need ii: phi-1.5 technical report","author":"Li","year":"2023"},{"key":"ref64","article-title":"Starcoder 2 and the stack v2: The next generation","author":"Lozhkov","year":"2024"},{"key":"ref65","article-title":"Palm 2 technical report","author":"Anil","year":"2023"},{"key":"ref66","article-title":"Starcoder: may the source be with you!","author":"Li","year":"2023"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.5040\/9781501365072.10293"},{"key":"ref68","article-title":"Santacoder: don\u2019t reach for the stars!","author":"Allal","year":"2023"},{"key":"ref69","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref70","article-title":"Training compute-optimal large language models","author":"Hoffmann","year":"2022"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"key":"ref73","article-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"ref74","article-title":"Alice in wonderland: Simple tasks showing complete reasoning breakdown in state-of-the-art large language models","year":"2024"},{"key":"ref75","volume-title":"Natural language processing with transformers","author":"Tunstall","year":"2022"},{"key":"ref76","article-title":"Glu variants improve transformer","author":"Shazeer","year":"2020"},{"key":"ref77","article-title":"Searching for activation functions","author":"Ramachandran","year":"2017"},{"key":"ref78","article-title":"Language modeling with gated convolutional networks","volume-title":"Proceedings of the 34th International Conference on Machine Learning","volume":"70","author":"Dauphin"},{"key":"ref79","article-title":"Rectified linear units improve restricted boltzmann machines","volume-title":"Proceedings of the 27th international conference on machine learning (ICML-10)","author":"Nair"},{"key":"ref80","article-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"ref81","article-title":"Qlora: Efficient finetuning of quantized llms","author":"Dettmers","year":"2023"},{"key":"ref82","article-title":"Wizardlm: Empowering large language models to follow complex instructions","author":"Xu","year":"2023"},{"key":"ref83","article-title":"Octopack: Instruction tuning code large language models","author":"Muennighoff","year":"2023"},{"key":"ref84","article-title":"Codegemma: Open code language models for code generation and analysis","author":"Yadav","year":"2024"},{"key":"ref85","article-title":"Gemma: Open models based on gemini research and technology","year":"2024"},{"key":"ref86","article-title":"Mojobench: Language modeling and benchmarks for mojo","author":"Raihan","year":"2024"},{"key":"ref87","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref88","article-title":"Humaneval-xl: A multilingual code generation benchmark for cross-lingual natural language generalization","author":"Peng","year":"2024"},{"key":"ref89","article-title":"mhumaneval\u2013a multilingual benchmark to evaluate large language models for code generation","author":"Raihan","year":"2024"},{"key":"ref90","article-title":"Multipl-e: A scalable and extensible approach to benchmarking neural code generation","author":"Cassano","year":"2022"},{"key":"ref91","article-title":"Ds-1000: A natural and reliable benchmark for data science code generation","volume-title":"International Conference on Machine Learning","author":"Lai"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-62700-2_5"},{"key":"ref93","doi-asserted-by":"crossref","DOI":"10.21203\/rs.3.rs-5348871\/v1","article-title":"On the performance of large language models on introductory programming assignments","author":"Raihan","year":"2024"},{"key":"ref94","article-title":"Pitfalls in language models for code intelligence: A taxonomy and survey","author":"She","year":"2023"},{"key":"ref95","article-title":"Holistic evaluation of language models","author":"Liang","year":"2022"},{"key":"ref96","article-title":"Codexglue: A machine learning benchmark dataset for code understanding and generation","author":"Lu","year":"2021"},{"key":"ref97","article-title":"An impact study of code quality and quantity on ai code generation","author":"Peng","year":"2023"},{"key":"ref98","article-title":"Software robustness for ai\/ml components","author":"Pierazzi","year":"2020"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/SP46214.2022.9833571"},{"key":"ref100","article-title":"The unsolved problems in ai security","author":"Hendrycks","year":"2021"},{"key":"ref101","article-title":"The stack: 3 tb of permissively licensed source code","author":"Kocetkov","year":"2022"},{"key":"ref102","article-title":"Bloom: A 176b-parameter open-access multilingual language model","author":"Workshop","year":"2022"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.845"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645643"},{"key":"ref105","article-title":"The era of 1-bit llms: All large language models are in 1.58 bits","author":"Ma","year":"2024"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"ref107","article-title":"Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Liu"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","location":"Washington, DC, USA","start":{"date-parts":[[2024,12,15]]},"end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10826108.pdf?arnumber=10826108","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:49:17Z","timestamp":1737100157000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10826108\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":107,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10826108","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}