{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:28:31Z","timestamp":1778048911679,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3476209","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T04:49:21Z","timestamp":1634791761000},"page":"1-15","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":424,"title":["Efficient large-scale language model training on GPU clusters using megatron-LM"],"prefix":"10.1145","author":[{"given":"Deepak","family":"Narayanan","sequence":"first","affiliation":[{"name":"NVIDIA and Microsoft Research"}]},{"given":"Mohammad","family":"Shoeybi","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Jared","family":"Casper","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Patrick","family":"LeGresley","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Mostofa","family":"Patwary","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Vijay","family":"Korthikanti","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Dmitri","family":"Vainbrand","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Prethvi","family":"Kashinkunti","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Julie","family":"Bernauer","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Bryan","family":"Catanzaro","sequence":"additional","affiliation":[{"name":"NVIDIA"}]},{"given":"Amar","family":"Phanishayee","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Matei","family":"Zaharia","sequence":"additional","affiliation":[{"name":"Stanford University"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Applications of GPT-3. https:\/\/openai.com\/blog\/gpt-3-apps\/.  Applications of GPT-3. https:\/\/openai.com\/blog\/gpt-3-apps\/."},{"key":"e_1_3_2_2_2_1","unstructured":"DeepSpeed: Extreme-Scale Model Training for Everyone. https:\/\/www.microsoft.com\/en-us\/research\/blog\/deepspeed-extreme-scale-model-training-for-everyone\/.  DeepSpeed: Extreme-Scale Model Training for Everyone. https:\/\/www.microsoft.com\/en-us\/research\/blog\/deepspeed-extreme-scale-model-training-for-everyone\/."},{"key":"e_1_3_2_2_3_1","unstructured":"DeepSpeed Repository. https:\/\/www.deepspeed.ai\/.  DeepSpeed Repository. https:\/\/www.deepspeed.ai\/."},{"key":"e_1_3_2_2_4_1","unstructured":"GitHub Copilot. https:\/\/copilot.github.com\/.  GitHub Copilot. https:\/\/copilot.github.com\/."},{"key":"e_1_3_2_2_5_1","unstructured":"Microsoft Translates Spoken Text to Code. https:\/\/techcrunch.com\/2021\/05\/25\/microsoft-uses-gpt-3-to-let-you-code-in-natural-language\/.  Microsoft Translates Spoken Text to Code. 
https:\/\/techcrunch.com\/2021\/05\/25\/microsoft-uses-gpt-3-to-let-you-code-in-natural-language\/."},{"key":"e_1_3_2_2_6_1","unstructured":"NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/.  NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/."},{"key":"e_1_3_2_2_7_1","unstructured":"NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl.  NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_2_8_1","unstructured":"NVIDIA Selene Supercomputer. https:\/\/www.top500.org\/system\/179842\/.  NVIDIA Selene Supercomputer. https:\/\/www.top500.org\/system\/179842\/."},{"key":"e_1_3_2_2_9_1","unstructured":"NVLink and NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/.  NVLink and NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/."},{"key":"e_1_3_2_2_10_1","unstructured":"PyTorch JIT. https:\/\/pytorch.org\/docs\/stable\/jit.html.  PyTorch JIT. https:\/\/pytorch.org\/docs\/stable\/jit.html."},{"key":"e_1_3_2_2_11_1","volume-title":"Language Models are Few-Shot Learners. arXiv preprint arXiv:2005.14165","author":"Brown Tom","year":"2020","unstructured":"Tom Brown , Benjamin Mann , Nick Ryder , Melanie Subbiah , and Language Models are Few-Shot Learners. arXiv preprint arXiv:2005.14165 , 2020 . Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, and et al. Language Models are Few-Shot Learners. arXiv preprint arXiv:2005.14165, 2020."},{"key":"e_1_3_2_2_12_1","volume-title":"Training Deep Nets with Sublinear Memory Cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen , Bing Xu , Chiyuan Zhang , and Carlos Guestrin . Training Deep Nets with Sublinear Memory Cost. arXiv preprint arXiv:1604.06174 , 2016 . Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. Training Deep Nets with Sublinear Memory Cost. arXiv preprint arXiv:1604.06174, 2016."},{"key":"e_1_3_2_2_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 , 2018 . Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_2_15_1","volume-title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. arXiv preprint arXiv:2101.03961","author":"Fedus William","year":"2021","unstructured":"William Fedus , Barret Zoph , and Noam Shazeer . Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. arXiv preprint arXiv:2101.03961 , 2021 . William Fedus, Barret Zoph, and Noam Shazeer. Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. arXiv preprint arXiv:2101.03961, 2021."},{"key":"e_1_3_2_2_16_1","first-page":"77","volume-title":"Domain Parallelism in Training Neural Networks. 
In Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures","author":"Gholami Amir","year":"2018","unstructured":"Amir Gholami , Ariful Azad , Peter Jin , Kurt Keutzer , and Aydin Buluc . Integrated Model , Batch, and Domain Parallelism in Training Neural Networks. In Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures , pages 77 -- 86 , 2018 . Amir Gholami, Ariful Azad, Peter Jin, Kurt Keutzer, and Aydin Buluc. Integrated Model, Batch, and Domain Parallelism in Training Neural Networks. In Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures, pages 77--86, 2018."},{"key":"e_1_3_2_2_17_1","volume-title":"Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal , Piotr Doll\u00e1r , Ross Girshick , Pieter Noordhuis , Lukasz Wesolowski , Aapo Kyrola , Andrew Tulloch , Yangqing Jia , and Kaiming He. Accurate , Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:1706.02677 , 2017 . Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:1706.02677, 2017."},{"key":"e_1_3_2_2_18_1","volume-title":"Revolve: An Implementation of Checkpointing for the Reverse or Adjoint Mode of Computational Differentiation. ACM Transactions on Mathematical Software (TOMS), 26(1):19--45","author":"Griewank Andreas","year":"2000","unstructured":"Andreas Griewank and Andrea Walther . Revolve: An Implementation of Checkpointing for the Reverse or Adjoint Mode of Computational Differentiation. ACM Transactions on Mathematical Software (TOMS), 26(1):19--45 , 2000 . Andreas Griewank and Andrea Walther. Revolve: An Implementation of Checkpointing for the Reverse or Adjoint Mode of Computational Differentiation. ACM Transactions on Mathematical Software (TOMS), 26(1):19--45, 2000."},{"key":"e_1_3_2_2_19_1","volume-title":"PipeTransformer: Automated Elastic Pipelining for Distributed Training of Transformers. arXiv preprint arXiv:2102.03161","author":"He Chaoyang","year":"2021","unstructured":"Chaoyang He , Shen Li , Mahdi Soltanolkotabi , and Salman Avestimehr . PipeTransformer: Automated Elastic Pipelining for Distributed Training of Transformers. arXiv preprint arXiv:2102.03161 , 2021 . Chaoyang He, Shen Li, Mahdi Soltanolkotabi, and Salman Avestimehr. PipeTransformer: Automated Elastic Pipelining for Distributed Training of Transformers. arXiv preprint arXiv:2102.03161, 2021."},{"key":"e_1_3_2_2_20_1","first-page":"103","volume-title":"Advances in Neural Information Processing Systems","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang , Youlong Cheng , Ankur Bapna , Orhan Firat , Dehao Chen , Mia Chen , HyoukJoong Lee , Jiquan Ngiam , Quoc V Le , Yonghui Wu , : Efficient Training of Giant Neural Networks using Pipeline Parallelism . In Advances in Neural Information Processing Systems , pages 103 -- 112 , 2019 . Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems, pages 103--112, 2019."},{"key":"e_1_3_2_2_21_1","first-page":"497","volume-title":"Ion Stoica. Breaking the Memory Wall with Optimal Tensor Rematerialization. 
In Proceedings of Machine Learning and Systems 2020","author":"Jain Paras","year":"2020","unstructured":"Paras Jain , Ajay Jain , Aniruddha Nrusimha , Amir Gholami , Pieter Abbeel , Joseph Gonzalez , Kurt Keutzer , and Ion Stoica. Breaking the Memory Wall with Optimal Tensor Rematerialization. In Proceedings of Machine Learning and Systems 2020 , pages 497 -- 511 . 2020 . Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Joseph Gonzalez, Kurt Keutzer, and Ion Stoica. Breaking the Memory Wall with Optimal Tensor Rematerialization. In Proceedings of Machine Learning and Systems 2020, pages 497--511. 2020."},{"key":"e_1_3_2_2_22_1","volume-title":"Alex Aiken. Beyond Data and Model Parallelism for Deep Neural Networks. In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys)","author":"Jia Zhihao","year":"2018","unstructured":"Zhihao Jia , Matei Zaharia , and Alex Aiken. Beyond Data and Model Parallelism for Deep Neural Networks. In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys) , 2018 . Zhihao Jia, Matei Zaharia, and Alex Aiken. Beyond Data and Model Parallelism for Deep Neural Networks. In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys), 2018."},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of Machine Learning and Systems","author":"Kosson Atli","year":"2021","unstructured":"Atli Kosson , Vitaliy Chiley , Abhinav Venigalla , Joel Hestness , and Urs K\u00f6ster . Pipelined Backpropagation at Scale: Training Large Models without Batches . Proceedings of Machine Learning and Systems , 2021 . Atli Kosson, Vitaliy Chiley, Abhinav Venigalla, Joel Hestness, and Urs K\u00f6ster. Pipelined Backpropagation at Scale: Training Large Models without Batches. Proceedings of Machine Learning and Systems, 2021."},{"key":"e_1_3_2_2_24_1","volume-title":"Scale MLPerf-0.6 Models on Google TPU-v3 Pods. arXiv preprint arXiv:1909.09756","author":"Kumar Sameer","year":"2019","unstructured":"Sameer Kumar , Victor Bitorff , Dehao Chen , Chiachen Chou , Blake Hechtman , HyoukJoong Lee , Naveen Kumar , Peter Mattson , Shibo Wang , Tao Wang , Scale MLPerf-0.6 Models on Google TPU-v3 Pods. arXiv preprint arXiv:1909.09756 , 2019 . Sameer Kumar, Victor Bitorff, Dehao Chen, Chiachen Chou, Blake Hechtman, HyoukJoong Lee, Naveen Kumar, Peter Mattson, Shibo Wang, Tao Wang, et al. Scale MLPerf-0.6 Models on Google TPU-v3 Pods. arXiv preprint arXiv:1909.09756, 2019."},{"key":"e_1_3_2_2_25_1","volume-title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li , Yanli Zhao , Rohan Varma , Omkar Salpekar , Pieter Noordhuis , Teng Li , Adam Paszke , Jeff Smith , Brian Vaughan , Pritam Damania , PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704 , 2020 . Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704, 2020."},{"key":"e_1_3_2_2_26_1","volume-title":"TeraPipe: Token-Level Pipeline Parallelism for Training Large-Scale Language Models. arXiv preprint arXiv:2102.07988","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li , Siyuan Zhuang , Shiyuan Guo , Danyang Zhuo , Hao Zhang , Dawn Song , and Ion Stoica . 
TeraPipe: Token-Level Pipeline Parallelism for Training Large-Scale Language Models. arXiv preprint arXiv:2102.07988 , 2021 . Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. TeraPipe: Token-Level Pipeline Parallelism for Training Large-Scale Language Models. arXiv preprint arXiv:2102.07988, 2021."},{"key":"e_1_3_2_2_27_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR, abs\/1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu , Myle Ott , Naman Goyal , Jingfei Du , Mandar Joshi , Danqi Chen , Omer Levy , Mike Lewis , Luke Zettlemoyer , and Veselin Stoyanov . RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR, abs\/1907.11692 , 2019 . Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR, abs\/1907.11692, 2019."},{"key":"e_1_3_2_2_28_1","volume-title":"MLPerf Training Benchmark. arXiv preprint arXiv:1910.01500","author":"Mattson Peter","year":"2019","unstructured":"Peter Mattson , Christine Cheng , Cody Coleman , Greg Diamos , Paulius Micikevicius , David Patterson , Hanlin Tang , Gu-Yeon Wei , Peter Bailis , Victor Bittorf , MLPerf Training Benchmark. arXiv preprint arXiv:1910.01500 , 2019 . Peter Mattson, Christine Cheng, Cody Coleman, Greg Diamos, Paulius Micikevicius, David Patterson, Hanlin Tang, Gu-Yeon Wei, Peter Bailis, Victor Bittorf, et al. MLPerf Training Benchmark. arXiv preprint arXiv:1910.01500, 2019."},{"key":"e_1_3_2_2_29_1","first-page":"1","volume-title":"Matei Zaharia. PipeDream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan , Aaron Harlap , Amar Phanishayee , Vivek Seshadri , Nikhil R Devanur , Gregory R Ganger , Phillip B Gibbons , and Matei Zaharia. PipeDream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles , pages 1 -- 15 , 2019 . Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. PipeDream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles, pages 1--15, 2019."},{"key":"e_1_3_2_2_30_1","first-page":"7937","volume-title":"Matei Zaharia. Memory-Efficient Pipeline-Parallel DNN Training. In International Conference on Machine Learning","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan , Amar Phanishayee , Kaiyu Shi , Xie Chen , and Matei Zaharia. Memory-Efficient Pipeline-Parallel DNN Training. In International Conference on Machine Learning , pages 7937 -- 7947 . PMLR, 2021 . Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. Memory-Efficient Pipeline-Parallel DNN Training. In International Conference on Machine Learning, pages 7937--7947. PMLR, 2021."},{"key":"e_1_3_2_2_31_1","first-page":"307","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Park Jay H","year":"2020","unstructured":"Jay H Park , Gyeongchan Yun , M Yi Chang , Nguyen T Nguyen , Seungmin Lee , Jaesik Choi , Sam H Noh , and Young-ri Choi. HetPipe : Enabling Large DNN Training on (Whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism . 
In 2020 USENIX Annual Technical Conference (USENIX ATC 20) , pages 307 -- 321 , 2020 . Jay H Park, Gyeongchan Yun, M Yi Chang, Nguyen T Nguyen, Seungmin Lee, Jaesik Choi, Sam H Noh, and Young-ri Choi. HetPipe: Enabling Large DNN Training on (Whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In 2020 USENIX Annual Technical Conference (USENIX ATC 20), pages 307--321, 2020."},{"key":"e_1_3_2_2_32_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , Alban Desmaison , Andreas Kopf , Edward Yang , Zachary DeVito , Martin Raison , Alykhan Tejani , Sasank Chilamkurthy , Benoit Steiner , Lu Fang , Junjie Bai , and Soumith Chintala . PyTorch: An Imperative Style , High-Performance Deep Learning Library . In Advances in Neural Information Processing Systems, volume 32 , 2019 . Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems, volume 32, 2019."},{"key":"e_1_3_2_2_33_1","volume-title":"Improving Language Understanding by Generative Pre-Training","author":"Radford Alec","year":"2018","unstructured":"Alec Radford , Karthik Narasimhan , Tim Salimans , and Ilya Sutskever . Improving Language Understanding by Generative Pre-Training , 2018 . Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. Improving Language Understanding by Generative Pre-Training, 2018."},{"issue":"8","key":"e_1_3_2_2_34_1","first-page":"9","article-title":"Language Models are Unsupervised Multitask Learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford , Jeffrey Wu , Rewon Child , David Luan , Dario Amodei , and Ilya Sutskever . Language Models are Unsupervised Multitask Learners . OpenAI Blog , 1 ( 8 ): 9 , 2019 . Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language Models are Unsupervised Multitask Learners. OpenAI Blog, 1(8):9, 2019.","journal-title":"OpenAI Blog"},{"key":"e_1_3_2_2_35_1","volume-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683","author":"Raffel Colin","year":"2019","unstructured":"Colin Raffel , Noam Shazeer , Adam Roberts , Katherine Lee , Sharan Narang , Michael Matena , Yanqi Zhou , Wei Li , and Peter J. Liu . Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 , 2019 . Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683, 2019."},{"key":"e_1_3_2_2_36_1","volume-title":"ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. arXiv preprint arXiv:1910.02054","author":"Rajbhandari Samyam","year":"2019","unstructured":"Samyam Rajbhandari , Jeff Rasley , Olatunji Ruwase , and Yuxiong He . ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. 
arXiv preprint arXiv:1910.02054 , 2019 . Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. arXiv preprint arXiv:1910.02054, 2019."},{"key":"e_1_3_2_2_37_1","volume-title":"ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. arXiv preprint arXiv:2104.07857","author":"Rajbhandari Samyam","year":"2021","unstructured":"Samyam Rajbhandari , Olatunji Ruwase , Jeff Rasley , Shaden Smith , and Yuxiong He . ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. arXiv preprint arXiv:2104.07857 , 2021 . Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, and Yuxiong He. ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. arXiv preprint arXiv:2104.07857, 2021."},{"key":"e_1_3_2_2_38_1","volume-title":"Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. ZeRO-Offload: Democratizing Billion-Scale Model Training. arXiv preprint arXiv:2101.06840","author":"Ren Jie","year":"2021","unstructured":"Jie Ren , Samyam Rajbhandari , Reza Yazdani Aminabadi , Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. ZeRO-Offload: Democratizing Billion-Scale Model Training. arXiv preprint arXiv:2101.06840 , 2021 . Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. ZeRO-Offload: Democratizing Billion-Scale Model Training. arXiv preprint arXiv:2101.06840, 2021."},{"key":"e_1_3_2_2_39_1","volume-title":"Neural Information Processing Systems","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer , Youlong Cheng , Niki Parmar , Dustin Tran , Ashish Vaswani , Penporn Koanantakool , Peter Hawkins , HyoukJoong Lee , Mingsheng Hong , Cliff Young , Ryan Sepassi , and Blake Hechtman . Mesh-TensorFlow : Deep Learning for Supercomputers . In Neural Information Processing Systems , 2018 . Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, Ryan Sepassi, and Blake Hechtman. Mesh-TensorFlow: Deep Learning for Supercomputers. In Neural Information Processing Systems, 2018."},{"key":"e_1_3_2_2_40_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models using GPU Model Parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi , Mostofa Patwary , Raul Puri , Patrick LeGresley , Jared Casper , and Bryan Catanzaro . Megatron-LM: Training Multi-Billion Parameter Language Models using GPU Model Parallelism. arXiv preprint arXiv:1909.08053 , 2019 . Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-LM: Training Multi-Billion Parameter Language Models using GPU Model Parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_2_41_1","first-page":"15451","volume-title":"Advances in Neural Information Processing Systems","author":"Tarnawski Jakub M","year":"2020","unstructured":"Jakub M Tarnawski , Amar Phanishayee , Nikhil Devanur , Divya Mahajan , and Fanny Nina Paravecino . Efficient Algorithms for Device Placement of DNN Graph Operators . In Advances in Neural Information Processing Systems , pages 15451 -- 15463 , 2020 . Jakub M Tarnawski, Amar Phanishayee, Nikhil Devanur, Divya Mahajan, and Fanny Nina Paravecino. Efficient Algorithms for Device Placement of DNN Graph Operators. 
In Advances in Neural Information Processing Systems, pages 15451--15463, 2020."},{"key":"e_1_3_2_2_42_1","volume-title":"Attention is All You Need. arXiv preprint arXiv:1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Lukasz Kaiser , and Illia Polosukhin . Attention is All You Need. arXiv preprint arXiv:1706.03762 , 2017 . Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is All You Need. arXiv preprint arXiv:1706.03762, 2017."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2015.2472014"},{"key":"e_1_3_2_2_44_1","volume-title":"Automatic Cross-Replica Sharding of Weight Updates in Data-Parallel Training. arXiv preprint arXiv:2004.13336","author":"Xu Yuanzhong","year":"2020","unstructured":"Yuanzhong Xu , HyoukJoong Lee , Dehao Chen , Hongjun Choi , Blake Hechtman , and Shibo Wang . Automatic Cross-Replica Sharding of Weight Updates in Data-Parallel Training. arXiv preprint arXiv:2004.13336 , 2020 . Yuanzhong Xu, HyoukJoong Lee, Dehao Chen, Hongjun Choi, Blake Hechtman, and Shibo Wang. Automatic Cross-Replica Sharding of Weight Updates in Data-Parallel Training. arXiv preprint arXiv:2004.13336, 2020."},{"key":"e_1_3_2_2_45_1","volume-title":"Christopher De Sa. PipeMare: Asynchronous Pipeline Parallel DNN Training. Proceedings of Machine Learning and Systems","author":"Yang Bowen","year":"2021","unstructured":"Bowen Yang , Jian Zhang , Jonathan Li , Christopher R\u00e9 , Christopher Aberger , and Christopher De Sa. PipeMare: Asynchronous Pipeline Parallel DNN Training. Proceedings of Machine Learning and Systems , 2021 . Bowen Yang, Jian Zhang, Jonathan Li, Christopher R\u00e9, Christopher Aberger, and Christopher De Sa. PipeMare: Asynchronous Pipeline Parallel DNN Training. Proceedings of Machine Learning and Systems, 2021."},{"key":"e_1_3_2_2_46_1","volume-title":"XLNet: Generalized Autoregressive Pretraining for Language Understanding. CoRR, abs\/1906.08237","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang , Zihang Dai , Yiming Yang , Jaime G. Carbonell , Ruslan Salakhutdinov , and Quoc V. Le . XLNet: Generalized Autoregressive Pretraining for Language Understanding. CoRR, abs\/1906.08237 , 2019 . Zhilin Yang, Zihang Dai, Yiming Yang, Jaime G. Carbonell, Ruslan Salakhutdinov, and Quoc V. Le. XLNet: Generalized Autoregressive Pretraining for Language Understanding. CoRR, abs\/1906.08237, 2019."},{"key":"e_1_3_2_2_47_1","first-page":"1","volume-title":"Kurt Keutzer. ImageNet Training in Minutes. In Proceedings of the 47th International Conference on Parallel Processing","author":"You Yang","year":"2018","unstructured":"Yang You , Zhao Zhang , Cho-Jui Hsieh , James Demmel , and Kurt Keutzer. ImageNet Training in Minutes. In Proceedings of the 47th International Conference on Parallel Processing , pages 1 -- 10 , 2018 . Yang You, Zhao Zhang, Cho-Jui Hsieh, James Demmel, and Kurt Keutzer. ImageNet Training in Minutes. In Proceedings of the 47th International Conference on Parallel Processing, pages 1--10, 2018."}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. 
Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476209","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3476209","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:21Z","timestamp":1750191141000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476209"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":47,"alternative-id":["10.1145\/3458817.3476209","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3476209","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]},"assertion":[{"value":"2021-11-13","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}