{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:43:50Z","timestamp":1782834230086,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,8,7]],"date-time":"2023-08-07T00:00:00Z","timestamp":1691366400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,8,7]]},"DOI":"10.1145\/3605573.3605613","type":"proceedings-article","created":{"date-parts":[[2023,9,13]],"date-time":"2023-09-13T16:21:16Z","timestamp":1694622076000},"page":"766-775","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":78,"title":["Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2037-2496","authenticated-orcid":false,"given":"Shenggui","family":"Li","sequence":"first","affiliation":[{"name":"HPC-AI Technology Inc., Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0495-1108","authenticated-orcid":false,"given":"Hongxin","family":"Liu","sequence":"additional","affiliation":[{"name":"HPC-AI Technology Inc., China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1906-1781","authenticated-orcid":false,"given":"Zhengda","family":"Bian","sequence":"additional","affiliation":[{"name":"HPC-AI Technology Inc., China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6724-2763","authenticated-orcid":false,"given":"Jiarui","family":"Fang","sequence":"additional","affiliation":[{"name":"HPC-AI Technology Inc., China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6301-5557","authenticated-orcid":false,"given":"Haichen","family":"Huang","sequence":"additional","affiliation":[{"name":"HPC-AI Technology Inc., China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1878-0199","authenticated-orcid":false,"given":"Yuliang","family":"Liu","sequence":"additional","affiliation":[{"name":"HPC-AI Technology Inc., China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3622-6020","authenticated-orcid":false,"given":"Boxiang","family":"Wang","sequence":"additional","affiliation":[{"name":"HPC-AI Technology Inc., Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2816-4384","authenticated-orcid":false,"given":"Yang","family":"You","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,9,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1147\/rd.395.0575"},{"key":"e_1_3_2_1_2_1","unstructured":"Mandeep Baines Shruti Bhosale Vittorio Caggiano Naman Goyal Siddharth Goyal Myle Ott Benjamin Lefaudeux Vitaliy Liptchinsky Mike Rabbat Sam Sheiffer Anjali Sridhar and Min Xu. 2021. FairScale: A general purpose modular PyTorch library for high performance and large scale training. https:\/\/github.com\/facebookresearch\/fairscale."},{"key":"e_1_3_2_1_3_1","volume-title":"Communication efficient matrix multiplication on hypercubes. Parallel computing 12, 3","author":"Berntsen Jarle","year":"1989","unstructured":"Jarle Berntsen. 1989. Communication efficient matrix multiplication on hypercubes. Parallel computing 12, 3 (1989), 335\u2013342."},{"key":"e_1_3_2_1_4_1","volume-title":"Maximizing Parallelism in Distributed Training for Huge Neural Networks. arXiv preprint arXiv:2105.14450","author":"Bian Zhengda","year":"2021","unstructured":"Zhengda Bian, Qifan Xu, Boxiang Wang, and Yang You. 2021. Maximizing Parallelism in Distributed Training for Huge Neural Networks. arXiv preprint arXiv:2105.14450 (2021)."},{"key":"e_1_3_2_1_5_1","volume-title":"Language models are few-shot learners. arXiv preprint arXiv:2005.14165","author":"Brown B","year":"2020","unstructured":"Tom\u00a0B Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_6_1","volume-title":"A cellular computer to implement the Kalman filter algorithm","author":"Cannon Lynn\u00a0Elliot","unstructured":"Lynn\u00a0Elliot Cannon. 1969. A cellular computer to implement the Kalman filter algorithm. Montana State University."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. https:\/\/doi.org\/10.48550\/ARXIV.1604.06174","DOI":"10.48550\/ARXIV.1604.06174"},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"All NLP Tasks Are Generation Tasks: A General Pretraining Framework. arXiv preprint arXiv:2103.10360","author":"Du Zhengxiao","year":"2021","unstructured":"Zhengxiao Du, Yujie Qian, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, and Jie Tang. 2021. All NLP Tasks Are Generation Tasks: A General Pretraining Framework. arXiv preprint arXiv:2103.10360 (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"Adaptive subgradient methods for online learning and stochastic optimization.Journal of machine learning research 12, 7","author":"Duchi John","year":"2011","unstructured":"John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization.Journal of machine learning research 12, 7 (2011)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","unstructured":"Jiarui Fang Yang Yu Zilin Zhu Shenggui Li Yang You and Jie Zhou. 2021. PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management. https:\/\/doi.org\/10.48550\/ARXIV.2108.05818","DOI":"10.48550\/ARXIV.2108.05818"},{"key":"e_1_3_2_1_13_1","unstructured":"Wikimedia Foundation. [n. d.]. Wikimedia Downloads. https:\/\/dumps.wikimedia.org"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","volume-title":"Advances in Neural Information Processing Systems, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.). Vol.\u00a032. Curran Associates","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V Le, Yonghui Wu, and zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.). Vol.\u00a032. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/093f65e080a295f8076b1c5722a46aa2-Paper.pdf"},{"key":"e_1_3_2_1_16_1","volume-title":"GPipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism","author":"Huang Yanping","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia\u00a0Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_17_1","volume-title":"Adam: A Method for Stochastic Optimization. International Conference on Learning Representations (12","author":"Kingma Diederik","year":"2014","unstructured":"Diederik Kingma and Jimmy Ba. 2014. Adam: A Method for Stochastic Optimization. International Conference on Learning Representations (12 2014)."},{"key":"e_1_3_2_1_18_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma P","year":"2014","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_19_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=qrwe7XHTmYb","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. {GS}hard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=qrwe7XHTmYb"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2105.13120"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","unstructured":"Denis Paperno Germ\u00e1n Kruszewski Angeliki Lazaridou Quan Pham Raffaella Bernardi Sandro Pezzelle Marco Baroni Gemma Boleda and Raquel Fern\u00e1ndez. 2016. The LAMBADA dataset: Word prediction requiring a broad discourse context. 1525\u20131534. https:\/\/doi.org\/10.18653\/v1\/P16-1144","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_28_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_30_1","unstructured":"Jie Ren Samyam Rajbhandari Reza\u00a0Yazdani Aminabadi Olatunji Ruwase Shuangyan Yang Minjia Zhang Dong Li and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. arxiv:2101.06840\u00a0[cs.DC]"},{"key":"e_1_3_2_1_31_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike\u00a0Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 (2018). arXiv:1802.05799http:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_32_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Edgar Solomonik and James Demmel. 2011. Communication-Optimal Parallel 2.5D Matrix Multiplication and LU Factorization Algorithms. In Euro-Par.","DOI":"10.1007\/978-3-642-23397-5_10"},{"key":"e_1_3_2_1_34_1","volume-title":"van\u00a0de Geijn and Jerrell Watts","author":"A.","year":"1995","unstructured":"Robert\u00a0A. van\u00a0de Geijn and Jerrell Watts. 1995. SUMMA: Scalable Universal Matrix Multiplication Algorithm. Technical Report. USA."},{"key":"e_1_3_2_1_35_1","volume-title":"Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0V. Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0V. Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_36_1","volume-title":"arXiv preprint arXiv:2105.14500","author":"Wang Boxiang","year":"2021","unstructured":"Boxiang Wang, Qifan Xu, Zhengda Bian, and Yang You. 2021. 2.5-dimensional distributed model training. arXiv preprint arXiv:2105.14500 (2021)."},{"key":"e_1_3_2_1_37_1","volume-title":"Linformer: Self-Attention with Linear Complexity. arXiv preprint arXiv:2006.04768","author":"Wang Sinong","year":"2020","unstructured":"Sinong Wang, Belinda Li, Madian Khabsa, Han Fang, and Hao Ma. 2020. Linformer: Self-Attention with Linear Complexity. arXiv preprint arXiv:2006.04768 (2020)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue Anthony Moi Pierric Cistac Tim Rault R\u00e9mi Louf Morgan Funtowicz Joe Davison Sam Shleifer Patrick von Platen Clara Ma Yacine Jernite Julien Plu Canwen Xu Teven\u00a0Le Scao Sylvain Gugger Mariama Drame Quentin Lhoest and Alexander\u00a0M. Rush. 2020. HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing. arxiv:1910.03771\u00a0[cs.CL]","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_39_1","volume-title":"An Efficient 2D Method for Training Super-Large Deep Learning Models. arXiv preprint arXiv:2104.05343","author":"Xu Qifan","year":"2021","unstructured":"Qifan Xu, Shenggui Li, Chaoyu Gong, and Yang You. 2021. An Efficient 2D Method for Training Super-Large Deep Learning Models. arXiv preprint arXiv:2104.05343 (2021)."},{"key":"e_1_3_2_1_40_1","volume-title":"Big Bird: Transformers for Longer Sequences. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.\u00a0F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Kumar\u00a0Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, and Amr Ahmed. 2020. Big Bird: Transformers for Longer Sequences. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.\u00a0F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates, Inc., 17283\u201317297. https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/c8512d142a2d849725f31a9a7a361ab9-Paper.pdf"},{"key":"e_1_3_2_1_41_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. arxiv:2205.01068\u00a0[cs.CL]","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit\u00a0Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arxiv:2205.01068\u00a0[cs.CL]"},{"key":"e_1_3_2_1_42_1","volume-title":"Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric\u00a0P. Xing, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 559\u2013578. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zheng-lianmin"}],"event":{"name":"ICPP 2023: 52nd International Conference on Parallel Processing","location":"Salt Lake City UT USA","acronym":"ICPP 2023"},"container-title":["Proceedings of the 52nd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605573.3605613","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3605573.3605613","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:04Z","timestamp":1750182544000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605573.3605613"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,7]]},"references-count":42,"alternative-id":["10.1145\/3605573.3605613","10.1145\/3605573"],"URL":"https:\/\/doi.org\/10.1145\/3605573.3605613","relation":{},"subject":[],"published":{"date-parts":[[2023,8,7]]},"assertion":[{"value":"2023-09-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}