{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T15:29:17Z","timestamp":1775143757100,"version":"3.50.1"},"reference-count":62,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10825098","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"2100-2109","source":"Crossref","is-referenced-by-count":1,"title":["HLAT: High-quality Large Language Model Pre-trained on AWS Trainium"],"prefix":"10.1109","author":[{"given":"Haozheng","family":"Fan","sequence":"first","affiliation":[{"name":"Amazon,Amazon Web Services"}]},{"given":"Hao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Amazon,AWS AI Labs"}]},{"given":"Guangtai","family":"Huang","sequence":"additional","affiliation":[{"name":"Amazon,Amazon Web Services"}]},{"given":"Parameswaran","family":"Raman","sequence":"additional","affiliation":[{"name":"Amazon,Amazon Web Services"}]},{"given":"Xinwei","family":"Fu","sequence":"additional","affiliation":[{"name":"Amazon,Amazon Web Services"}]},{"given":"Gaurav","family":"Gupta","sequence":"additional","affiliation":[{"name":"Amazon,AWS AI Labs"}]},{"given":"Dhananjay","family":"Ram","sequence":"additional","affiliation":[{"name":"Amazon,AGI Foundations"}]},{"given":"Yida","family":"Wang","sequence":"additional","affiliation":[{"name":"Amazon,Amazon Web Services"}]},{"given":"Jun","family":"Huan","sequence":"additional","affiliation":[{"name":"Amazon,AWS AI Labs"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/612"},{"key":"ref3","article-title":"How good are gpt models at machine translation? 
a comprehensive evaluation","author":"Hendy","year":"2023"},{"key":"ref4","article-title":"Large language models for information retrieval: A survey","author":"Zhu","year":"2023"},{"key":"ref5","article-title":"Code llama: Open foundation models for code","author":"Roziere","year":"2023"},{"key":"ref6","article-title":"A survey of large language models","author":"Zhao","year":"2023"},{"key":"ref7","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown","year":"2020"},{"key":"ref8","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref9","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref10","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref12","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019"},{"key":"ref13","article-title":"Fairscale: A general purpose modular pytorch library for high performance and large scale training","year":"2021"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref15","article-title":"Aligning ai with shared human values","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks"},{"key":"ref16","article-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"Abadi","year":"2015"},{"key":"ref17","article-title":"Training deep nets with sublinear memory cost","author":"Chen","year":"2016"},{"key":"ref18","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Korthikanti"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref21","article-title":"Redpajama: an open dataset for training large language models","year":"2023"},{"key":"ref22","article-title":"peS2o (Pretraining Efficiently on S2ORC) Dataset","author":"Soldaini","year":"2023","journal-title":"Allen Institute for AI, Tech. 
Rep."},{"key":"ref23","article-title":"Openwebmath: An open dataset of highquality mathematical web text","author":"Paster","year":"2023"},{"key":"ref24","article-title":"Apache arrow, a crosslanguage development platform for in-memory analytics","author":"Arrow","year":"2020"},{"key":"ref25","article-title":"NeMo: a toolkit for Conversational AI and Large Language Models","author":"Harper"},{"key":"ref26","article-title":"Mixed precision training","author":"Micikevicius","year":"2017"},{"key":"ref27","first-page":"1737","article-title":"Deep learning with limited numerical precision","volume-title":"Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37","author":"Gupta"},{"key":"ref28","article-title":"Glu variants improve transformer","author":"Shazeer","year":"2020"},{"key":"ref29","article-title":"Openllama: An open reproduction of llama","author":"Geng","year":"2023"},{"key":"ref30","article-title":"Spike no more: Stabilizing the pre-training of large language models","author":"Takase","year":"2023"},{"key":"ref31","article-title":"Decoupled weight decay regularization","volume-title":"International Conference on Learning Representations","author":"Loshchilov"},{"key":"ref32","article-title":"Distributed inference and finetuning of large language models over the internet","volume":"36","author":"Borzunov","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref33","article-title":"Averaging weights leads to wider optima and better generalization","author":"Izmailov","year":"2018"},{"key":"ref34","article-title":"The RefinedWeb dataset for Falcon LLM: outperforming curated corpora with web data, and web data only","author":"Penedo","year":"2023"},{"key":"ref35","author":"Li","year":"2023","journal-title":"Starcoder: may the source be with you!"},{"key":"ref36","article-title":"A survey on evaluation of large language models","author":"Chang","year":"2023","journal-title":"ACM Transactions on Intelligent Systems and Technology"},{"key":"ref37","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref38","article-title":"A framework for few-shot language model evaluation","author":"Gao","year":"2021"},{"key":"ref39","article-title":"Measuring massive multitask language understanding","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3474381"},{"key":"ref43","article-title":"Think you have solved question answering? 
try arc, the ai2 reasoning challenge","author":"Clark","year":"2018"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"ref46","article-title":"Boolq: Exploring the surprising difficulty of natural yes\/no questions","author":"Clark","year":"2019","journal-title":"NAACL"},{"key":"ref47","article-title":"Training verifiers to solve math word problems","author":"Cobbe","year":"2021"},{"key":"ref48","article-title":"Textbooks are all you need","author":"Gunasekar","year":"2023"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.841"},{"key":"ref50","article-title":"Truthfulqa: Measuring how models mimic human falsehoods","author":"Lin","year":"2021"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.154"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3698038.3698535"},{"key":"ref53","article-title":"Siren\u2019s song in the ai ocean: A survey on hallucination in large language models","author":"Zhang","year":"2023"},{"key":"ref54","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019","journal-title":"North American Chapter of the Association for Computational Linguistics"},{"issue":"8","key":"ref55","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref56","article-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref58","article-title":"Training compute-optimal large language models","author":"Hoffmann","year":"2022"},{"key":"ref59","article-title":"Palm 2 technical report","author":"Anil","year":"2023"},{"key":"ref60","article-title":"The falcon series of open language models","author":"Almazrouei","year":"2023"},{"key":"ref61","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"ref62","article-title":"Gpt-4 technical report","year":"2023"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","location":"Washington, DC, USA","start":{"date-parts":[[2024,12,15]]},"end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10825098.pdf?arnumber=10825098","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:45:24Z","timestamp":1737099924000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10825098\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10825098","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}
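
The record above is a Crossref REST API "work" message. A minimal sketch of fetching and parsing the same record, assuming only the public api.crossref.org endpoint and the Python standard library (the mailto address in the User-Agent header is a placeholder you should replace with your own):

import json
from urllib.request import Request, urlopen

# DOI of the record shown above
DOI = "10.1109/bigdata62323.2024.10825098"

# Crossref's REST API serves a single work at /works/{doi}; including a
# contact address in the User-Agent is the API's suggested etiquette.
url = f"https://api.crossref.org/works/{DOI}"
req = Request(url, headers={"User-Agent": "example/0.1 (mailto:you@example.org)"})

with urlopen(req) as resp:
    record = json.load(resp)

# Crossref wraps the work metadata under "message", as in the record above.
work = record["message"]

title = work["title"][0]                    # "title" is a list of strings
authors = [f'{a.get("given", "")} {a["family"]}'.strip() for a in work["author"]]
year = work["issued"]["date-parts"][0][0]   # dates are nested [[Y, M, D]] arrays
refs = work.get("references-count", 0)

print(f"{title} ({year})")
print(f"{len(authors)} authors, {refs} references")
print(f"DOI: {work['DOI']}")

Run against this DOI, the sketch would print the paper title, the nine authors, and the 62-entry reference count; note that fields like "given" or "journal-title" are optional in Crossref records, hence the defensive .get() calls.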