{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,28]],"date-time":"2025-06-28T06:40:03Z","timestamp":1751092803919,"version":"3.41.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T00:00:00Z","timestamp":1748131200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T00:00:00Z","timestamp":1748131200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,25]]},"DOI":"10.1109\/iscas56072.2025.11044158","type":"proceedings-article","created":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T17:42:19Z","timestamp":1751046139000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["LLM Training Workload IO Characteristics"],"prefix":"10.1109","author":[{"given":"Kiran","family":"Gunnam","sequence":"first","affiliation":[{"name":"Micron Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alex","family":"Mohandas","sequence":"additional","affiliation":[{"name":"Micron Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mahesh Kumar","family":"Dhote","sequence":"additional","affiliation":[{"name":"Micron Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rajesh","family":"Bhagwat","sequence":"additional","affiliation":[{"name":"Micron Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Darshan","family":"P","sequence":"additional","affiliation":[{"name":"Micron Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kanaiah","family":"Kothalikar","sequence":"additional","affiliation":[{"name":"Micron Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"Understanding the Workload Characteristics of Large Language Model Development","year":"2024","author":"Hu","key":"ref1"},{"article-title":"BurstGPT: A Real-world Workload Dataset to Optimize LLM Serving Systems","year":"2024","author":"Wang","key":"ref2"},{"article-title":"Storage Benchmarking with Deep Learning Workloads","year":"2021","author":"Cheng","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446763"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3659995.3660038"},{"key":"ref6","first-page":"11117","article-title":"Cramming: Training a Language Model on a single GPU in one day","volume-title":"International Conference on Machine Learning","author":"Geiping"},{"article-title":"FlashNeuron: SSD-Enabled Large-Batch Training of Very Deep Neural Networks","year":"2021","author":"Bae","key":"ref7"},{"article-title":"ByteCheckpoint: A Unified Checkpointing System for LLM Development","year":"2024","author":"Wan","key":"ref8"},{"article-title":"TBA: Faster Large Language Model Training Using SSD-Based Activation Offloading","year":"2024","author":"Wu","key":"ref9"},{"volume-title":"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness.","year":"2022","author":"Dao","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611977936.28"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.3390\/app14188186"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3538643.3539750"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2020.101828"},{"article-title":"A study of checkpointing in large scale training of deep neural networks","year":"2020","author":"Rojas","key":"ref15"},{"key":"ref16","article-title":"The Cost of Training AI Could Soon Become Too Much to Bear","volume-title":"Yahoo Finance","author":"Meyer","year":"2024"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2024.3426514"},{"article-title":"Decoupled Weight Decay Regularization","volume-title":"International Conference on Learning Representations (ICLR)","author":"Loshchilov","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ipdps49936.2021.00023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/sc41404.2022.00046"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ipdps.2019.00099"},{"volume-title":"Learning N:M Fine-grained Structured Sparse Neural Networks From Scratch.","year":"2021","author":"Zhou","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/tcad.2010.2048362"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611977936.28"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.3390\/app14188186"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3538643.3539750"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2020.101828"},{"author":"Gunnam","key":"ref29","article-title":"Levaraging HBM for efficient training of large scale Foundational Models"},{"author":"Gunnam","key":"ref30","article-title":"Domain Specific Accelerator for the training of large scale Foundational Models"},{"year":"2024","key":"ref31","article-title":"Interim Report on Binary Floating-point Formats for Machine Learning"},{"year":"2024","key":"ref32","article-title":"Support for no-frills FP8 matmuls"},{"year":"2023","key":"ref33","article-title":"CUTLASS 3.2: New warp-specialized persistent FP8 GEMM kernel"},{"journal-title":"Listserv communication to IEEE P3109 Standards Working Group on Llama3, H100 and FP8 training efficiency","year":"2024","author":"Gunnam","key":"ref34"},{"article-title":"FP8-LM: Training FP8 Large Language Models","year":"2023","author":"Peng","key":"ref35"},{"article-title":"To FP8 and Back Again: Quantifying the Effects of Reducing Precision on LLM Training Stability","year":"2024","author":"Lee","key":"ref36"},{"article-title":"Ultra-Low Precision 4-bit Training of Deep Neural Networks","volume-title":"NeurIPS","author":"Sun","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.39"},{"article-title":"COAT: Compressing Optimizer States and Activation for Memory-Efficient FP8 Training","volume-title":"ICLR","author":"Xi","key":"ref39"},{"key":"ref40","article-title":"CXL memory use-case for datacenter SSD applications","author":"Bhagwat","year":"2023","journal-title":"IEEE CASS Chapter Egypt"}],"event":{"name":"2025 IEEE International Symposium on Circuits and Systems (ISCAS)","start":{"date-parts":[[2025,5,25]]},"location":"London, United Kingdom","end":{"date-parts":[[2025,5,28]]}},"container-title":["2025 IEEE International Symposium on Circuits and Systems (ISCAS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11043142\/11042930\/11044158.pdf?arnumber=11044158","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,28]],"date-time":"2025-06-28T06:05:54Z","timestamp":1751090754000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11044158\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,25]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/iscas56072.2025.11044158","relation":{},"subject":[],"published":{"date-parts":[[2025,5,25]]}}}