{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T13:10:53Z","timestamp":1774703453317,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":14,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,18]]},"DOI":"10.1145\/3703323.3704802","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T12:03:28Z","timestamp":1750853008000},"page":"378-380","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Data preparation for fine tuning Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-9517-7267","authenticated-orcid":false,"given":"Parameswaran","family":"Selvam","sequence":"first","affiliation":[{"name":"IBM Research, bengaluru, karnataka, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0031-8428","authenticated-orcid":false,"given":"Hima","family":"Patel","sequence":"additional","affiliation":[{"name":"IBM Research, bengaluru, karnataka, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4599-7203","authenticated-orcid":false,"given":"Saptha","family":"Surendran","sequence":"additional","affiliation":[{"name":"IBM Research, bengaluru, karnataka, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8463-1771","authenticated-orcid":false,"given":"Shivdeep","family":"Singh","sequence":"additional","affiliation":[{"name":"IBM Research, Gurgaon, Delhi, IN"}]}],"member":"320","published-online":{"date-parts":[[2025,6,25]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"BigCode. 2022. BigCode Project is an open scientific collaboration run by Hugging Face and ServiceNow Research focused on open and responsible development of LLMs for code.https:\/\/www.bigcode-project.org\/"},{"key":"e_1_3_3_1_3_2","unstructured":"Daya Guo Qihao Zhu Dejian Yang Zhenda Xie Kai Dong Wentao Zhang Guanting Chen Xiao Bi Y Wu YK Li et\u00a0al. 2024. DeepSeek-Coder: When the Large Language Model Meets Programming\u2013The Rise of Code Intelligence. arXiv preprint arXiv:2401.14196 (2024)."},{"key":"e_1_3_3_1_4_2","unstructured":"Denis Kocetkov Raymond Li Loubna\u00a0Ben Allal Jia Li Chenghao Mou Carlos\u00a0Mu\u00f1oz Ferrandis Yacine Jernite Margaret Mitchell Sean Hughes Thomas Wolf et\u00a0al. 2022. The stack: 3 tb of permissively licensed source code. arXiv preprint arXiv:2211.15533 (2022)."},{"key":"e_1_3_3_1_5_2","unstructured":"Raymond Li Loubna\u00a0Ben Allal Yangtian Zi Niklas Muennighoff Denis Kocetkov Chenghao Mou Marc Marone Christopher Akiki Jia Li Jenny Chim et\u00a0al. 2023. Starcoder: may the source be with you! arXiv preprint arXiv:2305.06161 (2023)."},{"key":"e_1_3_3_1_6_2","unstructured":"Anton Lozhkov Raymond Li Loubna\u00a0Ben Allal Federico Cassano Joel Lamy-Poirier Nouamane Tazi Ao Tang Dmytro Pykhtar Jiawei Liu Yuxiang Wei et\u00a0al. 2024. StarCoder 2 and The Stack v2: The Next Generation. arXiv preprint arXiv:2402.19173 (2024)."},{"key":"e_1_3_3_1_7_2","unstructured":"Mayank Mishra Matt Stallone Gaoyuan Zhang Yikang Shen Aditya Prasad Adriana\u00a0Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh et\u00a0al. 2024. Granite code models: A family of open foundation models for code intelligence. arXiv preprint arXiv:2405.04324 (2024)."},{"key":"e_1_3_3_1_8_2","unstructured":"Open-Source-Community. 2023. Data Prep Kit. https:\/\/github.com\/IBM\/data-prep-kit."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","unstructured":"Guilherme Penedo Hynek Kydl\u00ed\u010dek Leandro von Werra and Thomas Wolf. 2024. FineWeb. 10.57967\/hf\/2493","DOI":"10.57967\/hf\/2493"},{"key":"e_1_3_3_1_10_2","unstructured":"Nikhil Pinnaparaju Reshinth Adithyan Duy Phung Jonathan Tow James Baicoianu Ashish Datta Maksym Zhuravinskyi Dakota Mahan Marco Bellagente Carlos Riquelme et\u00a0al. 2024. Stable code technical report. arXiv preprint arXiv:2404.01226 (2024)."},{"key":"e_1_3_3_1_11_2","unstructured":"Jack\u00a0W Rae Sebastian Borgeaud Trevor Cai Katie Millican Jordan Hoffmann Francis Song John Aslanides Sarah Henderson Roman Ring Susannah Young et\u00a0al. 2021. Scaling language models: Methods analysis & insights from training gopher. arXiv preprint arXiv:2112.11446 (2021)."},{"key":"e_1_3_3_1_12_2","unstructured":"IBM Research. 2024. Technical report on Granite Foundation Models. https:\/\/www.ibm.com\/downloads\/cas\/X9W4O6BM."},{"key":"e_1_3_3_1_13_2","unstructured":"Daria Soboleva Faisal Al-Khateeb Robert Myers Jacob\u00a0R Steeves Joel Hestness and Nolan Dey. 2023. SlimPajama: A 627B token cleaned and deduplicated version of RedPajama. https:\/\/www.cerebras.net\/blog\/slimpajama-a-627b-token-cleaned-and-deduplicated-version-of-redpajama. https:\/\/huggingface.co\/datasets\/cerebras\/SlimPajama-627B"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Luca Soldaini Rodney Kinney Akshita Bhagia Dustin Schwenk David Atkinson Russell Authur Ben Bogin Khyathi Chandu Jennifer Dumas Yanai Elazar et\u00a0al. 2024. Dolma: An Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. arXiv preprint arXiv:2402.00159 (2024).","DOI":"10.18653\/v1\/2024.acl-long.840"},{"key":"e_1_3_3_1_15_2","unstructured":"Yury Tokpanov Beren Millidge Paolo Glorioso Jonathan Pilault Adam Ibrahim James Whittington and Quentin Anthony. 2024. Zyda: A 1.3 T Dataset for Open Language Modeling. arXiv preprint arXiv:2406.01981 (2024)."}],"event":{"name":"CODS-COMAD 2024: 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)","location":"Jodhpur India","acronym":"CODS-COMAD Dec '24"},"container-title":["Proceedings of the 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3703323.3704802","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T13:02:17Z","timestamp":1750856537000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3703323.3704802"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,18]]},"references-count":14,"alternative-id":["10.1145\/3703323.3704802","10.1145\/3703323"],"URL":"https:\/\/doi.org\/10.1145\/3703323.3704802","relation":{},"subject":[],"published":{"date-parts":[[2024,12,18]]},"assertion":[{"value":"2025-06-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}