{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,2]],"date-time":"2025-08-02T19:01:39Z","timestamp":1754161299568,"version":"3.41.2"},"publisher-location":"New York, NY, USA","reference-count":111,"publisher":"ACM","funder":[{"name":"Natural Science Foundation of China","award":["62272466","U24A20233"],"award-info":[{"award-number":["62272466","U24A20233"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,23]]},"DOI":"10.1145\/3696630.3728538","type":"proceedings-article","created":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T19:08:09Z","timestamp":1753729689000},"page":"122-133","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["An Empirical Study of Issues in Large Language Model Training Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1899-8561","authenticated-orcid":false,"given":"Yanjie","family":"Gao","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4236-289X","authenticated-orcid":false,"given":"Ruiming","family":"Lu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9148-5861","authenticated-orcid":false,"given":"Haoxiang","family":"Lin","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2239-4472","authenticated-orcid":false,"given":"Yueguo","family":"Chen","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,28]]},"reference":[{"volume-title":"d.]. GitHub Search API. Available at: https:\/\/developer.github.com\/v3\/search\/ (accessed","year":"2020","key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. GitHub Search API. Available at: https:\/\/developer.github.com\/v3\/search\/ (accessed June 27, 2020)."},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/github.com\/apache\/opennlp. Accessed","author":"Accessed Year","year":"2024","unstructured":"Year Accessed. Apache OpenNLP. https:\/\/github.com\/apache\/opennlp. Accessed: April 10, 2024."},{"volume-title":"TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi M.","key":"e_1_3_2_1_3_1","unstructured":"M. Abadi, P. Barham, J. Chen, and et al. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 265\u2013283."},{"key":"e_1_3_2_1_4_1","unstructured":"Epoch AI. 2025. Introducing the Distributed Training Interactive Simulator. https:\/\/epoch.ai\/blog\/introducing-the-distributed-training-interactive-simulator"},{"key":"e_1_3_2_1_5_1","unstructured":"Alibaba. 2025. Pai-Megatron-Patch. https:\/\/github.com\/alibaba\/Pai-Megatron-Patch"},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Apache mahout: Machine learning on distributed dataflow systems","volume":"21","author":"Anil Robin","year":"2020","unstructured":"Robin Anil, Gokhan Capan, Isabel Drost-Fromm, Ted Dunning, Ellen Friedman, Trevor Grant, Shannon Quinn, Paritosh Ranjan, Sebastian Schelter, and \u00d6zg\u00fcr Y\u0131lmazel. 2020. Apache mahout: Machine learning on distributed dataflow systems. Journal of Machine Learning Research 21, 127 (2020), 1\u20136.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","unstructured":"Amazon Web Services (AWS). 2022. Accelerate PyTorch with DeepSpeed to Train Large Language Models with Intel Habana Gaudi-Based DL1 EC2 Instances. https:\/\/aws.amazon.com\/blogs\/machine-learning\/accelerate-pytorch-with-deepspeed-to-train-large-language-models-with-intel-habana-gaudi-based-dl1-ec2-instances\/"},{"key":"e_1_3_2_1_8_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. (2016)."},{"key":"e_1_3_2_1_9_1","volume-title":"SIGIR 2012 workshop on open source information retrieval.","author":"Bia\u0142ecki Andrzej","year":"2012","unstructured":"Andrzej Bia\u0142ecki, Robert Muir, Grant Ingersoll, and Lucid Imagination. 2012. Apache lucene 4. In SIGIR 2012 workshop on open source information retrieval."},{"volume-title":"Research Workshop on Large Language Models - The Summer of Language Models","year":"2021","key":"e_1_3_2_1_10_1","unstructured":"BigScience. 2021. Research Workshop on Large Language Models - The Summer of Language Models 2021. https:\/\/github.com\/bigscience-workshop"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Sid Black Stella Biderman Eric Hallahan Quentin Anthony Leo Gao Laurence Golding Horace He Connor Leahy Kyle McDonell Jason Phang et al. 2022. Gpt-neox-20b: An open-source autoregressive language model. (2022).","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"e_1_3_2_1_12_1","first-page":"1877","article-title":"Language Models Are Few-Shot Learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, et al. 2020. Language Models Are Few-Shot Learners. Advances in Neural Information Processing Systems 33 (2020), 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","unstructured":"Shanqing Cai. 2017. Debug TensorFlow Models with tfdbg. https:\/\/developers.googleblog.com\/2017\/02\/debug-tensorflow-models-with-tfdbg.html."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3587155","article-title":"Toward understanding deep learning framework bugs","volume":"32","author":"Chen Junjie","year":"2023","unstructured":"Junjie Chen, Yihua Liang, Qingchao Shen, Jiajun Jiang, and Shuochuan Li. 2023. Toward understanding deep learning framework bugs. ACM Transactions on Software Engineering and Methodology 32, 6 (2023), 1\u201331.","journal-title":"ACM Transactions on Software Engineering and Methodology"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 28th ACM joint meeting on European software engineering conference and symposium on the foundations of software engineering. 750\u2013762","author":"Chen Zhenpeng","year":"2020","unstructured":"Zhenpeng Chen, Yanbin Cao, Yuanqiang Liu, Haoyu Wang, Tao Xie, and Xuanzhe Liu. 2020. A comprehensive study on challenges in deploying deep learning based software. In Proceedings of the 28th ACM joint meeting on European software engineering conference and symposium on the foundations of software engineering. 750\u2013762."},{"key":"e_1_3_2_1_16_1","volume-title":"2021 IEEE\/ACM 43rd International Conference on Software Engineering (ICSE). IEEE, 674\u2013685","author":"Chen Zhenpeng","year":"2021","unstructured":"Zhenpeng Chen, Huihan Yao, Yiling Lou, Yanbin Cao, Yuanqiang Liu, Haoyu Wang, and Xuanzhe Liu. 2021. An empirical study on deployment faults of deep learning based mobile applications. In 2021 IEEE\/ACM 43rd International Conference on Software Engineering (ICSE). IEEE, 674\u2013685."},{"key":"e_1_3_2_1_17_1","unstructured":"Fran\u00e7ois Chollet et al. 2015. Keras. https:\/\/keras.io."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1177\/001316446002000104","article-title":"A Coefficient of Agreement for Nominal Scales","volume":"20","author":"Cohen Jacob","year":"1960","unstructured":"Jacob Cohen. 1960. A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20, 1 (1960), 37.","journal-title":"Educational and Psychological Measurement"},{"key":"e_1_3_2_1_19_1","volume-title":"Thomas","author":"Cover Thomas M.","year":"2006","unstructured":"Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information Theory (Wiley Series in Telecommunications and Signal Processing). Wiley-Interscience, USA."},{"key":"e_1_3_2_1_20_1","unstructured":"Cursor. 2025. Cursor. https:\/\/www.cursor.com\/ Accessed: 2025-01-16."},{"key":"e_1_3_2_1_21_1","unstructured":"DeepSpeed. 2013. Autotuning. https:\/\/www.deepspeed.ai\/tutorials\/autotuning\/."},{"key":"e_1_3_2_1_22_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL 2019. Minneapolis, MN, 4171\u20134186. https:\/\/www.aclweb.org\/anthology\/N19-1423"},{"key":"e_1_3_2_1_23_1","volume-title":"2017 32nd IEEE\/ACM International Conference on Automated Software Engineering (ASE). 509\u2013519","author":"Franco Anthony Di","year":"2017","unstructured":"Anthony Di Franco, Hui Guo, and Cindy Rubio-Gonz\u00e1lez. 2017. A comprehensive study of real-world numerical bug characteristics. In 2017 32nd IEEE\/ACM International Conference on Automated Software Engineering (ASE). 509\u2013519."},{"key":"e_1_3_2_1_24_1","unstructured":"EleutherAI. 2025. GPT-NeoX. https:\/\/github.com\/EleutherAI\/gpt-neox"},{"key":"e_1_3_2_1_25_1","unstructured":"D Man\u00e9 et al. 2015. TensorBoard: TensorFlow's visualization toolkit. https:\/\/github.com\/tensorflow\/tensorboard."},{"key":"e_1_3_2_1_26_1","volume-title":"OSDI 2008","author":"Guo","year":"2008","unstructured":"Guo et al. 2008. R2: An Application-Level Kernel for Record and Replay. In OSDI 2008 (San Diego, CA). 193\u2013208. https:\/\/www.usenix.org\/conference\/osdi08\/r2-application-level-kernel-record-and-replay"},{"key":"e_1_3_2_1_27_1","volume-title":"Gpipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism. NeurIPS 2019 32","author":"Huang","year":"2019","unstructured":"Huang et al. 2019. Gpipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism. NeurIPS 2019 32 (2019)."},{"key":"e_1_3_2_1_28_1","volume-title":"Detecting TensorFlow Program Bugs in Real-World Industrial Environment. In ASE","author":"Liu","year":"2021","unstructured":"Liu et al. 2021. Detecting TensorFlow Program Bugs in Real-World Industrial Environment. In ASE 2021. 55\u201366."},{"key":"e_1_3_2_1_29_1","first-page":"227","article-title":"Datastates-llm","volume":"2024","author":"Maurya","year":"2024","unstructured":"Maurya et al. 2024. Datastates-llm: Lazy Asynchronous Checkpointing for Large Language Models. In HPDC 2024. 227\u2013239.","journal-title":"Lazy Asynchronous Checkpointing for Large Language Models. In HPDC"},{"key":"e_1_3_2_1_30_1","volume-title":"Transformers: State-of-the-Art NLP. In EMNLP 2020: System Demonstrations. 38\u201345","author":"Wolf","year":"2020","unstructured":"Wolf et al. 2020. Transformers: State-of-the-Art NLP. In EMNLP 2020: System Demonstrations. 38\u201345."},{"key":"e_1_3_2_1_31_1","volume-title":"DeepDiagnosis: Diagnosing and Fixing Faults in Deep Learning Programs. In ICSE 2022","author":"Wardat","year":"2022","unstructured":"Wardat et al. 2022. DeepDiagnosis: Diagnosing and Fixing Faults in Deep Learning Programs. In ICSE 2022 (Pittsburgh, PA). 561\u2013572."},{"key":"e_1_3_2_1_32_1","volume-title":"An Empirical Study on Numerical Bugs in Deep Learning Programs. In ASE","author":"Wang","year":"2023","unstructured":"Wang et al. 2023. An Empirical Study on Numerical Bugs in Deep Learning Programs. In ASE 2022. Article 173. 10.1145\/3551349.3559561"},{"key":"e_1_3_2_1_33_1","volume-title":"GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In SOSP 2023","author":"Wang","year":"2023","unstructured":"Wang et al. 2023. GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In SOSP 2023 (Koblenz, Germany). 364\u2013381."},{"key":"e_1_3_2_1_34_1","volume-title":"MODIST: Transparent Model Checking of Unmodified Distributed Systems. In NSDI 2009","author":"Yang","year":"2009","unstructured":"Yang et al. 2009. MODIST: Transparent Model Checking of Unmodified Distributed Systems. In NSDI 2009 (Boston, MA)."},{"key":"e_1_3_2_1_35_1","volume-title":"An Empirical Study on Quality Issues of Production Big Data Platform. In ICSE 2015","author":"Zhou","year":"2015","unstructured":"Zhou et al. 2015. An Empirical Study on Quality Issues of Production Big Data Platform. In ICSE 2015 (Florence, Italy). 17\u201326."},{"key":"e_1_3_2_1_36_1","volume-title":"An Empirical Study on TensorFlow Program Bugs. In ISSTA 2018","author":"Zhang","year":"2018","unstructured":"Zhang et al. 2018. An Empirical Study on TensorFlow Program Bugs. In ISSTA 2018 (Amsterdam, Netherlands). 129\u2013140. 10.1145\/3213846.3213866"},{"key":"e_1_3_2_1_37_1","unstructured":"Hugging Face. 2025. Qwen\/QVQ-72B-Preview. https:\/\/huggingface.co\/Qwen\/QVQ-72B-Preview"},{"key":"e_1_3_2_1_38_1","volume-title":"A review of sparse expert models in deep learning. arXiv preprint arXiv:2209.01667","author":"Fedus William","year":"2022","unstructured":"William Fedus, Jeff Dean, and Barret Zoph. 2022. A review of sparse expert models in deep learning. arXiv preprint arXiv:2209.01667 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"Echo: Simulating Distributed Training At Scale.","author":"Feng Yicheng","year":"2024","unstructured":"Yicheng Feng, Yuetao Chen, Kaiwen Chen, Jingzong Li, Tianyuan Wu, Peng Cheng, Chuan Wu, Wei Wang, Tsung-Yi Ho, and Hong Xu. 2024. Echo: Simulating Distributed Training At Scale. (2024)."},{"key":"e_1_3_2_1_40_1","unstructured":"Python Software Foundation. 2023. trace - Trace program execution function calls and exceptions. https:\/\/docs.python.org\/3\/library\/trace.html Accessed: 2025-01-16."},{"key":"e_1_3_2_1_41_1","volume-title":"An Empirical Study on Quality Issues of Deep Learning Platform. In 2023 IEEE\/ACM 45th International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP). 455\u2013466","author":"Gao Yanjie","year":"2023","unstructured":"Yanjie Gao, Xiaoxiang Shi, Haoxiang Lin, Hongyu Zhang, Hao Wu, Rui Li, and Mao Yang. 2023. An Empirical Study on Quality Issues of Deep Learning Platform. In 2023 IEEE\/ACM 45th International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP). 455\u2013466."},{"key":"e_1_3_2_1_42_1","unstructured":"GitHub. 2025. GitHub Copilot. https:\/\/github.com\/features\/copilot Accessed: 2025-01-16."},{"key":"e_1_3_2_1_43_1","unstructured":"Google. 2022. Best Practices for Performance and Cost Optimization for Machine Learning. https:\/\/cloud.google.com\/solutions\/machine-learning\/best-practices-for-ml-performance-cost."},{"key":"e_1_3_2_1_44_1","volume-title":"Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al.","author":"Gunasekar Suriya","year":"2023","unstructured":"Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio C\u00e9sar Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al. 2023. Textbooks are all you need. (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"An Empirical Study on Performance Bugs for Highly Configurable Software Systems. In ESEM","author":"Yu Han","year":"2016","unstructured":"Han and Yu. 2016. An Empirical Study on Performance Bugs for Highly Configurable Software Systems. In ESEM 2016. Article 23. 10.1145\/2961111.2962602"},{"volume-title":"Neural networks: a comprehensive foundation","author":"Haykin Simon","key":"e_1_3_2_1_46_1","unstructured":"Simon Haykin. 1994. Neural networks: a comprehensive foundation. Prentice Hall PTR."},{"key":"e_1_3_2_1_47_1","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian error linear units (gelus). (2016)."},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 27th ACM Joint Meeting on ESEC\/FSE. 510\u2013520","author":"Islam Md Johirul","year":"2019","unstructured":"Md Johirul Islam, Giang Nguyen, Rangeet Pan, and Hridesh Rajan. 2019. A Comprehensive Study on Deep Learning Bug Characteristics. In Proceedings of the 27th ACM Joint Meeting on ESEC\/FSE. 510\u2013520."},{"volume-title":"An Empirical Study on Bugs Inside TensorFlow. In Database Systems for Advanced Applications: 25th International Conference, DASFAA 2020. 604\u2013620","author":"Jia L.","key":"e_1_3_2_1_49_1","unstructured":"L. Jia, H. Zhong, X. Wang, L. Huang, and X. Lu. 2020. An Empirical Study on Bugs Inside TensorFlow. In Database Systems for Advanced Applications: 25th International Conference, DASFAA 2020. 604\u2013620."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Md Johirul Islam Giang Nguyen Rangeet Pan and Hridesh Rajan. 2019. A Comprehensive Study on Deep Learning Bug Characteristics. (2019) arXiv-1906.","DOI":"10.1145\/3338906.3338955"},{"key":"e_1_3_2_1_51_1","volume-title":"Adam: A method for stochastic optimization.","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. (2014)."},{"key":"e_1_3_2_1_52_1","unstructured":"Mikhail V Koroteev. 2021. BERT: a review of applications in natural language processing and understanding. (2021)."},{"key":"e_1_3_2_1_53_1","volume-title":"Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, and Minjia Zhang.","author":"Lian Xinyu","year":"2024","unstructured":"Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, and Minjia Zhang. 2024. Universal checkpointing: Efficient and flexible checkpointing for large scale distributed training. (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Jurassic-1: Technical details and evaluation. White Paper. AI21 Labs 1, 9","author":"Lieber Opher","year":"2021","unstructured":"Opher Lieber, Or Sharir, Barak Lenz, and Yoav Shoham. 2021. Jurassic-1: Technical details and evaluation. White Paper. AI21 Labs 1, 9 (2021)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","unstructured":"X. Liu D. Gu Z. Chen J. Wen Z. Zhang Y. Ma H. Wang and X. Jin. 2023. Rise of Distributed Deep Learning Training in the Big Model Era: From A Software Engineering Perspective. ACM Trans. Softw. Eng. Methodol. 30 (2023). Issue May. 10.1145\/3597204","DOI":"10.1145\/3597204"},{"key":"e_1_3_2_1_56_1","unstructured":"Yinhan Liu. [n. d.]. Roberta: A robustly optimized bert pretraining approach. 364 ([n. d.])."},{"key":"e_1_3_2_1_57_1","volume-title":"An Empirical Study on Performance Bugs in Deep Learning Frameworks. In 2022 IEEE International Conference on Software Maintenance and Evolution (ICSME). 35\u201346","author":"Makkouk Tarek","year":"2022","unstructured":"Tarek Makkouk, Dong Jae Kim, and Tse-Hsun Peter Chen. 2022. An Empirical Study on Performance Bugs in Deep Learning Frameworks. In 2022 IEEE International Conference on Software Maintenance and Evolution (ICSME). 35\u201346."},{"volume-title":"Metaseq: A codebase for working with open pre-trained transformers. https:\/\/github.com\/facebookresearch\/metaseq\/blob\/main\/projects\/OPT\/chronicles\/README.md Accessed","year":"2022","key":"e_1_3_2_1_58_1","unstructured":"Meta. 2022. Metaseq: A codebase for working with open pre-trained transformers. https:\/\/github.com\/facebookresearch\/metaseq\/blob\/main\/projects\/OPT\/chronicles\/README.md Accessed: 2022."},{"key":"e_1_3_2_1_59_1","unstructured":"Paulius Micikevicius Sharan Narang Jonah Alben Gregory Diamos Erich Elsen David Garcia Boris Ginsburg Michael Houston Oleksii Kuchaiev Ganesh Venkatesh et al. 2017. Mixed precision training. (2017)."},{"key":"e_1_3_2_1_60_1","unstructured":"Microsoft. 2023. Megatron-DeepSpeed. https:\/\/github.com\/microsoft\/Megatron-DeepSpeed."},{"key":"e_1_3_2_1_61_1","unstructured":"Microsoft. 2023. PHI-2 Model. https:\/\/huggingface.co\/microsoft\/phi-2."},{"key":"e_1_3_2_1_62_1","unstructured":"Microsoft. 2025. [BUG] CUDA error: an illegal memory access was encountered with Adam optimizer on H100. https:\/\/github.com\/microsoft\/DeepSpeed\/issues\/3429"},{"key":"e_1_3_2_1_63_1","unstructured":"Microsoft. 2025. Having issue with multi-node training. https:\/\/github.com\/microsoft\/DeepSpeed\/issues\/3546"},{"key":"e_1_3_2_1_64_1","volume-title":"Nick Cammarata, Leo Gao, Joshua Achiam, Catherine Yeh, Jan Leike, Jeff Wu, and William Saunders.","author":"Mossing Dan","year":"2024","unstructured":"Dan Mossing, Steven Bills, Henk Tillman, Tom Dupr\u00e9 la Tour, Nick Cammarata, Leo Gao, Joshua Achiam, Catherine Yeh, Jan Leike, Jeff Wu, and William Saunders. 2024. Transformer Debugger. https:\/\/github.com\/openai\/transformer-debugger."},{"key":"e_1_3_2_1_65_1","unstructured":"NVIDIA. [n. d.]. RuntimeError: Socket Timeout when setting up NCCL communicator. https:\/\/github.com\/NVIDIA\/Megatron-LM\/issues\/386"},{"key":"e_1_3_2_1_66_1","unstructured":"NVIDIA. 2021. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B the World's Largest and Most Powerful Generative Language Model. https:\/\/developer.nvidia.com\/blog\/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model\/"},{"key":"e_1_3_2_1_67_1","unstructured":"NVIDIA. 2022. Developing a 172B LLM with Strong Japanese Capabilities Using NVIDIA Megatron-LM. https:\/\/developer.nvidia.com\/blog\/developing-a-172b-llm-with-strong-japanese-capabilities-using-nvidia-megatron-lm\/"},{"key":"e_1_3_2_1_68_1","unstructured":"NVIDIA. 2023. Apex (A PyTorch Extension). https:\/\/github.com\/NVIDIA\/apex."},{"key":"e_1_3_2_1_69_1","unstructured":"Nvidia. 2023. CUDA toolkit. https:\/\/developer.nvidia.com\/cuda-toolkit."},{"key":"e_1_3_2_1_70_1","unstructured":"NVIDIA. 2023. NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests."},{"key":"e_1_3_2_1_71_1","unstructured":"NVIDIA. 2024. The NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_72_1","unstructured":"NVIDIA. 2025. AttributeError: 'DistributedOptimizer' object has no attribute '_-copy_model_params_to_main_params'. https:\/\/github.com\/NVIDIA\/Megatron-LM\/issues\/333"},{"key":"e_1_3_2_1_73_1","unstructured":"OpenAI. 2023. ChatGPT (Mar 14 version). https:\/\/chat.openai.com\/chat."},{"key":"e_1_3_2_1_74_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. abs\/2303.08774 (2023)."},{"key":"e_1_3_2_1_75_1","unstructured":"OpenAI. 2025. Scaling Kubernetes to 7 500 nodes. https:\/\/openai.com\/index\/scaling-kubernetes-to-7500-nodes\/"},{"key":"e_1_3_2_1_76_1","unstructured":"A. Paszke S. Gross F. Massa A. Lerer J. Bradbury G. Chanan T. Killeen Z. Lin N. Gimelshein L. Antiga A. Desmaison A. K\u00f6pf E. Yang Z. DeVito M. Raison A. Tejani S. Chilamkurthy B. Steiner L. Fang J. Bai and S. Chintala. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_77_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. https:\/\/cdn.openai.com\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf. (2019)."},{"volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari S.","key":"e_1_3_2_1_78_1","unstructured":"S. Rajbhandari, J. Rasley, O. Ruwase, and Y. He. 2020. ZeRO: Memory Optimizations toward Training Trillion Parameter Models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC '20). IEEE Press, Article 20, 16 pages."},{"key":"e_1_3_2_1_79_1","volume-title":"Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '20)","author":"Rasley Jeff","year":"2020","unstructured":"Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. 2020. Deep-Speed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '20). 3505\u20133506. 10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_80_1","volume-title":"Superbench: A super-resolution benchmark dataset for scientific machine learning.","author":"Ren Pu","year":"2023","unstructured":"Pu Ren, N Benjamin Erichson, Shashank Subramanian, Omer San, Zarija Lukic, and Michael W Mahoney. 2023. Superbench: A super-resolution benchmark dataset for scientific machine learning. (2023)."},{"key":"e_1_3_2_1_81_1","unstructured":"V Sanh. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. (2019)."},{"key":"e_1_3_2_1_82_1","volume-title":"Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al.","author":"Scao Teven Le","year":"2022","unstructured":"Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al. 2022. Bloom: A 176b-parameter open-access multilingual language model. (2022)."},{"key":"e_1_3_2_1_83_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR (2018)."},{"volume-title":"Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering. 968\u2013980","author":"Shen Q.","key":"e_1_3_2_1_84_1","unstructured":"Q. Shen, H. Ma, J. Chen, Y. Tian, S.-C. Cheung, and X. Chen. 2021. A Comprehensive Study of Deep Learning Compiler Bugs. In Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering. 968\u2013980."},{"key":"e_1_3_2_1_85_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism.","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. (2019)."},{"key":"e_1_3_2_1_86_1","unstructured":"Shaden Smith Mostofa Patwary Brandon Norick Patrick LeGresley Samyam Rajbhandari et al. 2022. Using DeepSpeed and Megatron to train Megatron-Turing NLG 530B a large-scale generative language model. (2022)."},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"crossref","first-page":"127063","DOI":"10.1016\/j.neucom.2023.127063","article-title":"Roformer: Enhanced transformer with rotary position embedding","volume":"568","author":"Su Jianlin","year":"2024","unstructured":"Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. 2024. Roformer: Enhanced transformer with rotary position embedding. Neurocomputing 568 (2024), 127063.","journal-title":"Neurocomputing"},{"key":"e_1_3_2_1_88_1","volume-title":"Silent Bugs in Deep Learning Frameworks: An Empirical Study of Keras and TensorFlow. ArXiv abs\/2112.13314","author":"Tambon Florian","year":"2021","unstructured":"Florian Tambon, Amin Nikanjam, Le An, Foutse Khomh, and Giuliano Antoniol. 2021. Silent Bugs in Deep Learning Frameworks: An Empirical Study of Keras and TensorFlow. ArXiv abs\/2112.13314 (2021)."},{"key":"e_1_3_2_1_89_1","volume-title":"Galactica: A large language model for science.","author":"Taylor Ross","year":"2022","unstructured":"Ross Taylor, Marcin Kardas, Guillem Cucurull, Thomas Scialom, Anthony Hartshorn, Elvis Saravia, Andrew Poulton, Viktor Kerkez, and Robert Stojnic. 2022. Galactica: A large language model for science. (2022)."},{"key":"e_1_3_2_1_90_1","unstructured":"PyTorch Team. 2024. Kineto. https:\/\/github.com\/pytorch\/kineto. Profiling library for PyTorch and other ML frameworks.."},{"key":"e_1_3_2_1_91_1","volume-title":"An Empirical Study of Bugs in Machine Learning Systems. In 2012 IEEE 23rd International Symposium on Software Reliability Engineering. 271\u2013280","author":"Thung Ferdian","year":"2012","unstructured":"Ferdian Thung, Shaowei Wang, David Lo, and Lingxiao Jiang. 2012. An Empirical Study of Bugs in Machine Learning Systems. In 2012 IEEE 23rd International Symposium on Software Reliability Engineering. 271\u2013280."},{"key":"e_1_3_2_1_92_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux et al. 2023. LLaMA: Open and Efficient Foundation Language Models. (2023)."},{"key":"e_1_3_2_1_93_1","unstructured":"Ashish Vaswani Noam M. Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NIPS."},{"key":"e_1_3_2_1_94_1","unstructured":"VolcEngine. 2025. A PyTorch Native LLM Training Framework. https:\/\/github.com\/volcengine\/veScale"},{"key":"e_1_3_2_1_95_1","unstructured":"Roger Waleffe Wonmin Byeon Duncan Riach Brandon Norick Vijay Korthikanti Tri Dao Albert Gu Ali Hatamizadeh Sudhakar Singh Deepak Narayanan et al. 2024. An Empirical Study of Mamba-based Language Models. (2024)."},{"key":"e_1_3_2_1_96_1","unstructured":"Borui Wan Mingji Han Yiyao Sheng Yanghua Peng Haibin Lin Mofan Zhang Zhichao Lai Menghan Yu Junda Zhang Zuquan Song et al. 2024. ByteCheck-point: A Unified Checkpointing System for Large Foundation Model Development. (2024)."},{"key":"e_1_3_2_1_97_1","unstructured":"Zhongwei Wan Xin Wang Che Liu Samiul Alam Yu Zheng Jiachen Liu Zhongnan Qu Shen Yan Yi Zhu Quanlu Zhang et al. 2023. Efficient large language models: A survey. (2023)."},{"key":"e_1_3_2_1_98_1","volume-title":"Instructretro: Instruction tuning post retrieval-augmented pretraining.","author":"Wang Boxin","year":"2023","unstructured":"Boxin Wang, Wei Ping, Lawrence McAfee, Peng Xu, Bo Li, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Instructretro: Instruction tuning post retrieval-augmented pretraining. (2023)."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"crossref","unstructured":"Boxin Wang Wei Ping Peng Xu Lawrence McAfee Zihan Liu Mohammad Shoeybi Yi Dong Oleksii Kuchaiev Bo Li Chaowei Xiao et al. 2023. Shall we pretrain autoregressive language models with retrieval? a comprehensive study. (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.482"},{"key":"e_1_3_2_1_100_1","volume-title":"HuggingFace's Transformers: State-of-the-art Natural Language Processing. CoRR abs\/1910.03771","author":"Wolf Thomas","year":"2019","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, and Jamie Brew. 2019. HuggingFace's Transformers: State-of-the-art Natural Language Processing. CoRR abs\/1910.03771 (2019). arXiv:1910.03771"},{"volume-title":"Workshop.","year":"2025","key":"e_1_3_2_1_101_1","unstructured":"BigScience Workshop. 2025. Megatron-DeepSpeed. https:\/\/github.com\/bigscience-workshop\/Megatron-DeepSpeed"},{"key":"e_1_3_2_1_102_1","volume-title":"Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering","author":"Yan Ming","year":"2021","unstructured":"Ming Yan, Junjie Chen, Xiangyu Zhang, Lin Tan, Gan Wang, and Zan Wang. 2021. Exposing Numerical Bugs in Deep Learning via Gradient Back-Propagation. In Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (Athens, Greece) (ESEC\/FSE 2021). New York, NY, USA, 627\u2013638."},{"key":"e_1_3_2_1_103_1","unstructured":"Zhilin Yang. 2019. XLNet: Generalized Autoregressive Pretraining for Language Understanding. (2019)."},{"key":"e_1_3_2_1_104_1","volume-title":"SLURM: Simple Linux Utility for Resource Management. In Job Scheduling Strategies for Parallel Processing","author":"Yoo Andy B.","year":"2003","unstructured":"Andy B. Yoo, Morris A. Jette, and Mark Grondona. 2003. SLURM: Simple Linux Utility for Resource Management. In Job Scheduling Strategies for Parallel Processing. Springer, 44\u201360."},{"key":"e_1_3_2_1_105_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. Glm-130b: An open bilingual pre-trained model. (2022)."},{"key":"e_1_3_2_1_106_1","volume-title":"Proceedings of the ACM\/IEEE 42nd international conference on software engineering. 1159\u20131170","author":"Zhang Ru","year":"2020","unstructured":"Ru Zhang, Wencong Xiao, Hongyu Zhang, Yu Liu, Haoxiang Lin, and Mao Yang. 2020. An empirical study on program failures of deep learning jobs. In Proceedings of the ACM\/IEEE 42nd international conference on software engineering. 1159\u20131170."},{"key":"e_1_3_2_1_107_1","volume-title":"Proceedings of the 42nd International Conference on Software Engineering","author":"Zhang Ru","year":"2020","unstructured":"Ru Zhang, Wencong Xiao, Hongyu Zhang, Yu Liu, Haoxiang Lin, and Mao Yang. 2020. An Empirical Study on Program Failures of Deep Learning Jobs. In Proceedings of the 42nd International Conference on Software Engineering (Seoul, Republic of Korea) (ICSE '20). New York, NY, USA, 1159\u20131170."},{"key":"e_1_3_2_1_108_1","volume-title":"Proceedings of the ACM\/IEEE 42nd International Conference on Software Engineering","author":"Zhang Ru","year":"2020","unstructured":"Ru Zhang, Wencong Xiao, Hongyu Zhang, Yu Liu, Haoxiang Lin, and Mao Yang. 2020. An Empirical Study on Program Failures of Deep Learning Jobs. In Proceedings of the ACM\/IEEE 42nd International Conference on Software Engineering (Seoul, South Korea) (ICSE '20). Association for Computing Machinery, New York, NY, USA, 1159\u20131170. 10.1145\/3377811.3380362"},{"key":"e_1_3_2_1_109_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2023","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2023. Opt: Open pre-trained transformer language models, 2022. 3 (2023), 19\u20130."},{"key":"e_1_3_2_1_110_1","volume-title":"An Empirical Study of Common Challenges in Developing Deep Learning Applications. In 2019 IEEE 30th International Symposium on Software Reliability Engineering (ISSRE). 104\u2013115","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Cuiyun Gao, Lei Ma, Michael Lyu, and Miryung Kim. 2019. An Empirical Study of Common Challenges in Developing Deep Learning Applications. In 2019 IEEE 30th International Symposium on Software Reliability Engineering (ISSRE). 104\u2013115. 10.1109\/ISSRE.2019.00020"},{"key":"e_1_3_2_1_111_1","volume-title":"Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. 447\u2013449","author":"Zhong Yuchen","year":"2023","unstructured":"Yuchen Zhong, Guangming Sheng, Juncheng Liu, Jinhui Yuan, and Chuan Wu. 2023. Swift: Expedited Failure Recovery for Large-Scale DNN Training. In Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. 447\u2013449."}],"event":{"name":"FSE Companion '25: 33rd ACM International Conference on the Foundations of Software Engineering","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Clarion Hotel Trondheim Trondheim Norway","acronym":"FSE Companion '25"},"container-title":["Proceedings of the 33rd ACM International Conference on the Foundations of Software Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696630.3728538","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T19:14:09Z","timestamp":1753730049000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696630.3728538"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,23]]},"references-count":111,"alternative-id":["10.1145\/3696630.3728538","10.1145\/3696630"],"URL":"https:\/\/doi.org\/10.1145\/3696630.3728538","relation":{},"subject":[],"published":{"date-parts":[[2025,6,23]]},"assertion":[{"value":"2025-07-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}