{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:28:32Z","timestamp":1763191712583,"version":"3.45.0"},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001804","name":"Canada Research Chairs","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001804","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228044","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-10","source":"Crossref","is-referenced-by-count":0,"title":["Training Dynamics of a 1.7B LLaMa Model: A Data-Efficient Approach"],"prefix":"10.1109","author":[{"given":"Miles Q.","family":"Li","sequence":"first","affiliation":[{"name":"McGill University,School of Information Studies,Montreal,Canada"}]},
{"given":"Benjamin C. M.","family":"Fung","sequence":"additional","affiliation":[{"name":"McGill University,School of Information Studies,Montreal,Canada"}]},{"given":"Shih-Chia","family":"Huang","sequence":"additional","affiliation":[{"name":"National Taipei University of Technology,Department of Electronic Engineering,Taipei,Taiwan"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Palm: Scaling language modeling with pathways","year":"2022","author":"Chowdhery","key":"ref2"},{"article-title":"Llama: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref3"},{"article-title":"Root mean square layer normalization","year":"2019","author":"Zhang","key":"ref4"},{"article-title":"Gaussian error linear units (gelus)","year":"2016","author":"Hendrycks","key":"ref5"},{"article-title":"The fineweb datasets: Decanting the web for the finest text data at scale","year":"2024","author":"Penedo","key":"ref6"},{"article-title":"The llama 3 herd of models","year":"2024","author":"Dubey","key":"ref7"},{"article-title":"Decoupled weight decay regularization","year":"2017","author":"Loshchilov","key":"ref8"},{"key":"ref9","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"article-title":"Llama 2: Open foundation and fine-tuned chat models","year":"2023","author":"Touvron","key":"ref10"},{"year":"2023","key":"ref11","article-title":"Gpt-4 technical report"},{"article-title":"Qwen2 technical report","year":"2024","author":"Yang","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"article-title":"Tinyllama: An open-source small language model","year":"2024","author":"Zhang","key":"ref14"},
{"article-title":"SlimPajama: A 627B token cleaned and deduplicated version of RedPajama","year":"2023","author":"Soboleva","key":"ref15"},{"key":"ref16","first-page":"4791","article-title":"Think you have solved question answering? try arc, the ai2 reasoning challenge","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","author":"Clark"},{"article-title":"Boolq: Exploring the surprising difficulty of natural yes\/no questions","year":"2019","author":"Clark","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1260"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6399"},{"article-title":"Scaling llm test-time compute optimally can be more effective than scaling model parameters","year":"2024","author":"Snell","key":"ref21"},{"article-title":"Stanford alpaca: An instruction-following llama model","year":"2023","author":"Taori","key":"ref22"},{"key":"ref23","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Lora: Low-rank adaptation of large language models","year":"2021","author":"Hu","key":"ref24"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],
"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228044.pdf?arnumber=11228044","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:26:19Z","timestamp":1763191579000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228044\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228044","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}