{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T01:12:29Z","timestamp":1775092349479,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731410","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:43:11Z","timestamp":1750437791000},"page":"1703-1716","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Scaling Llama 3 Training with Efficient Parallelism Strategies"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8356-1054","authenticated-orcid":false,"given":"Weiwei","family":"Chu","sequence":"first","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7285-6682","authenticated-orcid":false,"given":"Xinfeng","family":"Xie","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Bellevue, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2085-0312","authenticated-orcid":false,"given":"Jiecao","family":"Yu","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2133-7943","authenticated-orcid":false,"given":"Jie","family":"Wang","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2777-1118","authenticated-orcid":false,"given":"Amar","family":"Phanishayee","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Bellevue, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0133-4800","authenticated-orcid":false,"given":"Chunqiang","family":"Tang","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8513-9566","authenticated-orcid":false,"given":"Yuchen","family":"Hao","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7595-5539","authenticated-orcid":false,"given":"Jianyu","family":"Huang","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Bellevue, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6239-9622","authenticated-orcid":false,"given":"Mustafa","family":"Ozdal","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7414-6537","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5027-1452","authenticated-orcid":false,"given":"Vedanuj","family":"Goswami","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3985-0322","authenticated-orcid":false,"given":"Naman","family":"Goyal","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9261-0487","authenticated-orcid":false,"given":"Abhishek","family":"Kadian","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1236-9484","authenticated-orcid":false,"given":"Andrew","family":"Gu","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3416-8939","authenticated-orcid":false,"given":"Chris","family":"Cai","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7181-3293","authenticated-orcid":false,"given":"Feng","family":"Tian","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Bellevue, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5436-9952","authenticated-orcid":false,"given":"Xiaodong","family":"Wang","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0208-096X","authenticated-orcid":false,"given":"Min","family":"Si","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7830-0001","authenticated-orcid":false,"given":"Pavan","family":"Balaji","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6752-3135","authenticated-orcid":false,"given":"Ching-Hsiang","family":"Chu","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4750-9440","authenticated-orcid":false,"given":"Jongsoo","family":"Park","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc., Menlo Park, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"Understanding GPU Memory 1: Visualizing All Allocations over Time","author":"Aaron\u00a0Shi Zachary\u00a0DeVito","year":"2023","unstructured":"Zachary\u00a0DeVito Aaron\u00a0Shi. 2023. Understanding GPU Memory 1: Visualizing All Allocations over Time. https:\/\/pytorch.org\/blog\/understanding-gpu-memory-1\/"},{"key":"e_1_3_3_1_3_2","first-page":"23716","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob\u00a0L Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Miko\u0142\u00a0aj Bi\u0144kowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.), Vol.\u00a035. Curran Associates, Inc., 23716\u201323736. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/960a172bc7fbf0177ccccbb411a7d800-Paper-Conference.pdf"},{"key":"e_1_3_3_1_4_2","volume-title":"Introducing the next generation of Claude","year":"2024","unstructured":"Anthropic. 2024. Introducing the next generation of Claude. https:\/\/www.anthropic.com\/news\/claude-3-family"},{"key":"e_1_3_3_1_5_2","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.12966 (2023)."},{"key":"e_1_3_3_1_6_2","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training deep nets with sublinear memory cost. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1604.06174 (2016)."},{"key":"e_1_3_3_1_7_2","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung\u00a0Won Chung Charles Sutton Sebastian Gehrmann Parker Schuh Kensen Shi Sashank Tsvyashchenko Joshua Maynez Abhishek Rao Parker Barnes Yi Tay Noam Shazeer Vinodkumar Prabhakaran Emily Reif Nan Du Ben Hutchinson Reiner Pope James Bradbury Jacob Austin Michael Isard Guy Gur-Ari Pengcheng Yin Toju Duke Anselm Levskaya Sanjay Ghemawat Sunipa Dev Henryk Michalewski Xavier Garcia Vedant Misra Kevin Robinson Liam Fedus Denny Zhou Daphne Ippolito David Luan Hyeontaek Lim Barret Zoph Alexander Spiridonov Ryan Sepassi David Dohan Shivani Agrawal Mark Omernick Andrew\u00a0M. Dai Thanumalayan\u00a0Sankaranarayana Pillai Marie Pellat Aitor Lewkowycz Erica Moreira Rewon Child Oleksandr Polozov Katherine Lee Zongwei Zhou Xuezhi Wang Brennan Saeta Mark Diaz Orhan Firat Michele Catasta Jason Wei Kathy Meier-Hellstern Douglas Eck Jeff Dean Slav Petrov and Noah Fiedel. 2024. PaLM: scaling language modeling with pathways. J. Mach. Learn. Res. 24 1 Article 240 (March 2024) 113\u00a0pages."},{"key":"e_1_3_3_1_8_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_9_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel\u00a0Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_10_2","unstructured":"DeepSeek-AI. 2024. DeepSeek-V3 Technical Report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.19437 (2024)."},{"key":"e_1_3_3_1_11_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aurelien Rodriguez Austen Gregerson Ava Spataru Baptiste Roziere Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian\u00a0Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits Danny Wyatt David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric\u00a0Michael Smith Filip Radenovic Francisco Guzm\u00e1n Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia\u00a0Lewis Anderson Govind Thattai Graeme Nail Gregoire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol\u00a0Arrieta Ibarra Isabel Kloumann Ishan Misra Ivan Evtimov Jack Zhang Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van\u00a0der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan\u00a0Vasuden Alwala Karthik Prasad Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone Khalid El-Arini Krithika Iyer Kshitiz Malik Kuenley Chiu Kunal Bhalla Kushal Lakhotia Lauren Rantala-Yeary Laurens van\u00a0der Maaten Lawrence Chen Liang Tan Liz Jenkins Louis Martin Lovish Madaan Lubo Malo Lukas Blecher Lukas Landzaat Luke de Oliveira Madeline Muzzi Mahesh Pasupuleti Mannat Singh Manohar Paluri Marcin Kardas Maria Tsimpoukelli Mathew Oldham Mathieu Rita Maya Pavlova Melanie Kambadur Mike Lewis Min Si Mitesh\u00a0Kumar Singh Mona Hassan Naman Goyal Narjes Torabi Nikolay Bashlykov Nikolay Bogoychev Niladri Chatterji Ning Zhang Olivier Duchenne Onur \u00c7elebi Patrick Alrassy Pengchuan Zhang Pengwei Li Petar Vasic Peter Weng Prajjwal Bhargava Pratik Dubal Praveen Krishnan Punit\u00a0Singh Koura Puxin Xu Qing He Qingxiao Dong Ragavan Srinivasan Raj Ganapathy Ramon Calderer Ricardo\u00a0Silveira Cabral Robert Stojnic Roberta Raileanu Rohan Maheswari Rohit Girdhar Rohit Patel Romain Sauvestre Ronnie Polidoro Roshan Sumbaly Ross Taylor Ruan Silva Rui Hou Rui Wang Saghar Hosseini Sahana Chennabasappa Sanjay Singh Sean Bell Seohyun\u00a0Sonia Kim Sergey Edunov Shaoliang Nie Sharan Narang Sharath Raparthy Sheng Shen Shengye Wan Shruti Bhosale Shun Zhang Simon Vandenhende Soumya Batra Spencer Whitman Sten Sootla Stephane Collot Suchin Gururangan Sydney Borodinsky Tamar Herman Tara Fowler Tarek Sheasha Thomas Georgiou Thomas Scialom Tobias Speckbacher Todor Mihaylov Tong Xiao Ujjwal Karn Vedanuj Goswami Vibhor Gupta Vignesh Ramanathan Viktor Kerkez Vincent Gonguet Virginie Do Vish Vogeti V\u00edtor Albiero Vladan Petrovic Weiwei Chu Wenhan Xiong Wenyin Fu Whitney Meers Xavier Martinet Xiaodong Wang Xiaofang Wang Xiaoqing\u00a0Ellen Tan Xide Xia Xinfeng Xie Xuchao Jia Xuewei Wang Yaelle Goldschlag Yashesh Gaur Yasmine Babaei Yi Wen Yiwen Song Yuchen Zhang Yue Li Yuning Mao Zacharie\u00a0Delpierre Coudert Zheng Yan Zhengxing Chen Zoe Papakipos Aaditya Singh Aayushi Srivastava Abha Jain Adam Kelsey Adam Shajnfeld Adithya Gangidi Adolfo Victoria Ahuva Goldstand Ajay Menon Ajay Sharma Alex Boesenberg Alexei Baevski Allie Feinstein Amanda Kallet Amit Sangani Amos Teo Anam Yunus Andrei Lupu Andres Alvarado Andrew Caples Andrew Gu Andrew Ho Andrew Poulton Andrew Ryan Ankit Ramchandani Annie Dong Annie Franco Anuj Goyal Aparajita Saraf Arkabandhu Chowdhury Ashley Gabriel Ashwin Bharambe Assaf Eisenman Azadeh Yazdan Beau James Ben Maurer Benjamin Leonhardi Bernie Huang Beth Loyd Beto\u00a0De Paola Bhargavi Paranjape Bing Liu Bo Wu Boyu Ni Braden Hancock Bram Wasti Brandon Spence Brani Stojkovic Brian Gamido Britt Montalvo Carl Parker Carly Burton Catalina Mejia Ce Liu Changhan Wang Changkyu Kim Chao Zhou Chester Hu Ching-Hsiang Chu Chris Cai Chris Tindal Christoph Feichtenhofer Cynthia Gao Damon Civin Dana Beaty Daniel Kreymer Daniel Li David Adkins David Xu Davide Testuggine Delia David Devi Parikh Diana Liskovich Didem Foss Dingkang Wang Duc Le Dustin Holland Edward Dowling Eissa Jamil Elaine Montgomery Eleonora Presani Emily Hahn Emily Wood Eric-Tuan Le Erik Brinkman Esteban Arcaute Evan Dunbar Evan Smothers Fei Sun Felix Kreuk Feng Tian Filippos Kokkinos Firat Ozgenel Francesco Caggioni Frank Kanayet Frank Seide Gabriela\u00a0Medina Florez Gabriella Schwarz Gada Badeer Georgia Swee Gil Halpern Grant Herman Grigory Sizov Guangyi Zhang Guna Lakshminarayanan Hakan Inan Hamid Shojanazeri Han Zou Hannah Wang Hanwen Zha Haroun Habeeb Harrison Rudolph Helen Suk Henry Aspegren Hunter Goldman Hongyuan Zhan Ibrahim Damlaj Igor Molybog Igor Tufanov Ilias Leontiadis Irina-Elena Veliche Itai Gat Jake Weissman James Geboski James Kohli Janice Lam Japhet Asher Jean-Baptiste Gaya Jeff Marcus Jeff Tang Jennifer Chan Jenny Zhen Jeremy Reizenstein Jeremy Teboul Jessica Zhong Jian Jin Jingyi Yang Joe Cummings Jon Carvill Jon Shepard Jonathan McPhie Jonathan Torres Josh Ginsburg Junjie Wang Kai Wu Kam\u00a0Hou U Karan Saxena Kartikay Khandelwal Katayoun Zand Kathy Matosich Kaushik Veeraraghavan Kelly Michelena Keqian Li Kiran Jagadeesh Kun Huang Kunal Chawla Kyle Huang Lailin Chen Lakshya Garg Lavender A Leandro Silva Lee Bell Lei Zhang Liangpeng Guo Licheng Yu Liron Moshkovich Luca Wehrstedt Madian Khabsa Manav Avalani Manish Bhatt Martynas Mankus Matan Hasson Matthew Lennie Matthias Reso Maxim Groshev Maxim Naumov Maya Lathi Meghan Keneally Miao Liu Michael\u00a0L. Seltzer Michal Valko Michelle Restrepo Mihir Patel Mik Vyatskov Mikayel Samvelyan Mike Clark Mike Macey Mike Wang Miquel\u00a0Jubert Hermoso Mo Metanat Mohammad Rastegari Munish Bansal Nandhini Santhanam Natascha Parks Natasha White Navyata Bawa Nayan Singhal Nick Egebo Nicolas Usunier Nikhil Mehta Nikolay\u00a0Pavlovich Laptev Ning Dong Norman Cheng Oleg Chernoguz Olivia Hart Omkar Salpekar Ozlem Kalinli Parkin Kent Parth Parekh Paul Saab Pavan Balaji Pedro Rittner Philip Bontrager Pierre Roux Piotr Dollar Polina Zvyagina Prashant Ratanchandani Pritish Yuvraj Qian Liang Rachad Alao Rachel Rodriguez Rafi Ayub Raghotham Murthy Raghu Nayani Rahul Mitra Rangaprabhu Parthasarathy Raymond Li Rebekkah Hogan Robin Battey Rocky Wang Russ Howes Ruty Rinott Sachin Mehta Sachin Siby Sai\u00a0Jayesh Bondu Samyak Datta Sara Chugh Sara Hunt Sargun Dhillon Sasha Sidorov Satadru Pan Saurabh Mahajan Saurabh Verma Seiji Yamamoto Sharadh Ramaswamy Shaun Lindsay Shaun Lindsay Sheng Feng Shenghao Lin Shengxin\u00a0Cindy Zha Shishir Patil Shiva Shankar Shuqiang Zhang Shuqiang Zhang Sinong Wang Sneha Agarwal Soji Sajuyigbe Soumith Chintala Stephanie Max Stephen Chen Steve Kehoe Steve Satterfield Sudarshan Govindaprasad Sumit Gupta Summer Deng Sungmin Cho Sunny Virk Suraj Subramanian Sy Choudhury Sydney Goldman Tal Remez Tamar Glaser Tamara Best Thilo Koehler Thomas Robinson Tianhe Li Tianjun Zhang Tim Matthews Timothy Chou Tzook Shaked Varun Vontimitta Victoria Ajayi Victoria Montanez Vijai Mohan Vinay\u00a0Satish Kumar Vishal Mangla Vlad Ionescu Vlad Poenaru Vlad\u00a0Tiberiu Mihailescu Vladimir Ivanov Wei Li Wenchen Wang Wenwen Jiang Wes Bouaziz Will Constable Xiaocheng Tang Xiaojian Wu Xiaolan Wang Xilun Wu Xinbo Gao Yaniv Kleinman Yanjun Chen Ye Hu Ye Jia Ye Qi Yenda Li Yilin Zhang Ying Zhang Yossi Adi Youngjin Nam Yu Wang Yu Zhao Yuchen Hao Yundi Qian Yunlu Li Yuzi He Zach Rait Zachary DeVito Zef Rosnbrick Zhaoduo Wen Zhenyu Yang Zhiwei Zhao and Zhiyu Ma. 2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_1_12_2","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Mia\u00a0Xu Chen Dehao Chen HyoukJoong Lee Jiquan Ngiam Quoc\u00a0V. Le Yonghui Wu and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. arxiv:https:\/\/arXiv.org\/abs\/1811.06965\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1811.06965"},{"key":"e_1_3_3_1_13_2","series-title":"(NSDI\u201924)","volume-title":"Proceedings of the 21st USENIX Symposium on Networked Systems Design and Implementation","author":"Jiang Ziheng","year":"2025","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, Yulu Jia, Sun He, Hongmin Chen, Zhihao Bai, Qi Hou, Shipeng Yan, Ding Zhou, Yiyao Sheng, Zhuo Jiang, Haohan Xu, Haoran Wei, Zhang Zhang, Pengfei Nie, Leqi Zou, Sida Zhao, Liang Xiang, Zherui Liu, Zhe Li, Xiaoying Jia, Jianxi Ye, Xin Jin, and Xin Liu. 2025. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In Proceedings of the 21st USENIX Symposium on Networked Systems Design and Implementation (Santa Clara, CA, USA) (NSDI\u201924). USENIX Association, USA, Article 41, 16\u00a0pages."},{"key":"e_1_3_3_1_14_2","unstructured":"Dhiraj Kalamkar Dheevatsa Mudigere Naveen Mellempudi Dipankar Das Kunal Banerjee Sasikanth Avancha Dharma\u00a0Teja Vooturi Nataraj Jammalamadaka Jianyu Huang Hector Yuen Jiyan Yang Jongsoo Park Alexander Heinecke Evangelos Georganas Sudarshan Srinivasan Abhisek Kundu Misha Smelyanskiy Bharat Kaul and Pradeep Dubey. 2019. A Study of BFLOAT16 for Deep Learning Training. (2019). arxiv:https:\/\/arXiv.org\/abs\/1905.12322\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1905.12322"},{"key":"e_1_3_3_1_15_2","unstructured":"Vijay Korthikanti Jared Casper Sangkug Lym Lawrence McAfee Michael Andersch Mohammad Shoeybi and Bryan Catanzaro. 2022. Reducing Activation Recomputation in Large Transformer Models. arxiv:https:\/\/arXiv.org\/abs\/2205.05198\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2205.05198"},{"key":"e_1_3_3_1_16_2","volume-title":"Advances in Neural Information Processing Systems","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey\u00a0E Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems, F.\u00a0Pereira, C.J. Burges, L.\u00a0Bottou, and K.Q. Weinberger (Eds.), Vol.\u00a025. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2012\/file\/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf"},{"key":"e_1_3_3_1_17_2","unstructured":"Joel Lamy-Poirier. 2023. Breadth-First Pipeline Parallelism. arxiv:https:\/\/arXiv.org\/abs\/2211.05953\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2211.05953"},{"key":"e_1_3_3_1_18_2","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. (2020). arxiv:https:\/\/arXiv.org\/abs\/2006.15704\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2006.15704"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","unstructured":"Yujia Li David Choi Junyoung Chung Nate Kushman Julian Schrittwieser R\u00e9mi Leblond Tom Eccles James Keeling Felix Gimeno Agustin\u00a0Dal Lago Thomas Hubert Peter Choy Cyprien de Masson\u00a0d\u2019Autume Igor Babuschkin Xinyun Chen Po-Sen Huang Johannes Welbl Sven Gowal Alexey Cherepanov James Molloy Daniel\u00a0J. Mankowitz Esme\u00a0Sutherland Robson Pushmeet Kohli Nando de Freitas Koray Kavukcuoglu and Oriol Vinyals. 2022. Competition-level code generation with AlphaCode. Science 378 6624 (2022) 1092\u20131097. 10.1126\/science.abq1158 arXiv:https:\/\/www.science.org\/doi\/pdf\/10.1126\/science.abq1158","DOI":"10.1126\/science.abq1158"},{"key":"e_1_3_3_1_20_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_3_1_21_2","unstructured":"Hao Liu Matei Zaharia and Pieter Abbeel. 2023. Ring Attention with Blockwise Transformers for Near-Infinite Context. arxiv:https:\/\/arXiv.org\/abs\/2310.01889\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2310.01889"},{"key":"e_1_3_3_1_22_2","unstructured":"Hao Liu Matei Zaharia and Pieter Abbeel. 2023. Ring Attention with Blockwise Transformers for Near-Infinite Context. arxiv:https:\/\/arXiv.org\/abs\/2310.01889\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2310.01889"},{"key":"e_1_3_3_1_23_2","volume-title":"Meta open compute project, grand teton ai platform","year":"2022","unstructured":"Meta. 2022. Meta open compute project, grand teton ai platform. https:\/\/engineering.fb.com\/2022\/10\/18\/open-source\/ocp-summit-2022-grand-teton"},{"key":"e_1_3_3_1_24_2","volume-title":"Introducing Llama 3.1: Our most capable models to date","year":"2024","unstructured":"Meta. 2024. Introducing Llama 3.1: Our most capable models to date. https:\/\/ai.meta.com\/blog\/meta-llama-3-1\/"},{"key":"e_1_3_3_1_25_2","volume-title":"Introducing Meta Llama 3: The most capable openly available LLM to date","year":"2024","unstructured":"Meta. 2024. Introducing Meta Llama 3: The most capable openly available LLM to date. https:\/\/ai.meta.com\/blog\/meta-llama-3\/"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_1_27_2","volume-title":"NVIDIA Hopper Architecture In-Depth","year":"2022","unstructured":"NVIDIA. 2022. NVIDIA Hopper Architecture In-Depth. https:\/\/developer.nvidia.com\/blog\/nvidia-hopper-architecture-in-depth\/"},{"key":"e_1_3_3_1_28_2","volume-title":"NVIDIA TransformerEngine","year":"2024","unstructured":"NVIDIA. 2024. NVIDIA TransformerEngine. https:\/\/github.com\/NVIDIA\/TransformerEngine"},{"key":"e_1_3_3_1_29_2","volume-title":"Introducing ChatGPT","year":"2022","unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/index\/chatgpt\/"},{"key":"e_1_3_3_1_30_2","unstructured":"Qwen Team. 2024. Qwen2.5 Technical Report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.15115 (2024)."},{"key":"e_1_3_3_1_31_2","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arxiv:https:\/\/arXiv.org\/abs\/1910.02054\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1910.02054"},{"key":"e_1_3_3_1_32_2","unstructured":"Jie Ren Samyam Rajbhandari Reza\u00a0Yazdani Aminabadi Olatunji Ruwase Shuangyan Yang Minjia Zhang Dong Li and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. arxiv:https:\/\/arXiv.org\/abs\/2101.06840\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2101.06840"},{"key":"e_1_3_3_1_33_2","unstructured":"SemiAnalysis. 2025. 100 000 H100 Clusters: Power Network Topology Ethernet vs InfiniBand Reliability Failures Checkpointing. https:\/\/semianalysis.com\/2024\/06\/17\/100000-h100-clusters-power-network\/"},{"key":"e_1_3_3_1_34_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arxiv:https:\/\/arXiv.org\/abs\/1909.08053\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307772.3328315"},{"key":"e_1_3_3_1_36_2","first-page":"24829","volume-title":"Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual","author":"Tarnawski Jakub","year":"2021","unstructured":"Jakub Tarnawski, Deepak Narayanan, and Amar Phanishayee. 2021. Piper: Multidimensional Planner for DNN Parallelization. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual. 24829\u201324840."},{"key":"e_1_3_3_1_37_2","unstructured":"Team Gemini. 2023. Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_1_38_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. (2023). arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_1_39_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian\u00a0Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit\u00a0Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric\u00a0Michael Smith Ranjan Subramanian Xiaoqing\u00a0Ellen Tan Binh Tang Ross Taylor Adina Williams Jian\u00a0Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv:https:\/\/arXiv.org\/abs\/2307.09288\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2307.09288"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"W\u00a0Eric Wong Ruizhi Gao Yihao Li Rui Abreu and Franz Wotawa. 2016. A survey on software fault localization. IEEE Transactions on Software Engineering 42 8 (2016) 707\u2013740.","DOI":"10.1109\/TSE.2016.2521368"},{"key":"e_1_3_3_1_41_2","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang Zhenda Xie Yu Wu Kai Hu Jiawei Wang Yaofeng Sun Yukun Li Yishi Piao Kang Guan Aixin Liu Xin Xie Yuxiang You Kai Dong Xingkai Yu Haowei Zhang Liang Zhao Yisong Wang and Chong Ruan. 2024. DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding. (2024). arxiv:https:\/\/arXiv.org\/abs\/2412.10302\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2412.10302"},{"key":"e_1_3_3_1_42_2","unstructured":"Wenhan Xiong Jingyu Liu Igor Molybog Hejia Zhang Prajjwal Bhargava Rui Hou Louis Martin Rashi Rungta Karthik\u00a0Abinav Sankararaman Barlas Oguz Madian Khabsa Han Fang Yashar Mehdad Sharan Narang Kshitiz Malik Angela Fan Shruti Bhosale Sergey Edunov Mike Lewis Sinong Wang and Hao Ma. 2023. Effective Long-Context Scaling of Foundation Models. arxiv:https:\/\/arXiv.org\/abs\/2309.16039\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2309.16039"},{"key":"e_1_3_3_1_43_2","unstructured":"Hu Xu Saining Xie Xiaoqing\u00a0Ellen Tan Po-Yao Huang Russell Howes Vasu Sharma Shang-Wen Li Gargi Ghosh Luke Zettlemoyer and Christoph Feichtenhofer. 2023. Demystifying clip data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16671 (2023)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359650"},{"key":"e_1_3_3_1_45_2","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arxiv:https:\/\/arXiv.org\/abs\/2304.11277\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2304.11277"},{"key":"e_1_3_3_1_46_2","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric\u00a0P. Xing, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022. USENIX Association, 559\u2013578."}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731410","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:03:22Z","timestamp":1750503802000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731410"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":45,"alternative-id":["10.1145\/3695053.3731410","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731410","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}