{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T19:20:20Z","timestamp":1773429620606,"version":"3.50.1"},"reference-count":179,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. Comput. Sci. Technol."],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s11390-024-4178-1","type":"journal-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T15:26:32Z","timestamp":1741793192000},"page":"6-41","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["AI Computing Systems for Large Language Models Training"],"prefix":"10.1007","volume":"40","author":[{"given":"Zhen-Xing","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yuan-Bo","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Han-Qi","family":"Lyu","sequence":"additional","affiliation":[]},{"given":"Chang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xia-Qing","family":"Li","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zi-Dong","family":"Du","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Ling","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xue-Hai","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Yun-Ji","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,12]]},"reference":[{"key":"4178_CR1","first-page":"6000","volume-title":"Proc. the 31st International Conference on Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser \u0141, Polosukhin I. Attention is all you need. In Proc. the 31st International Conference on Neural Information Processing Systems, Dec. 2017, pp.6000\u20136010."},{"key":"4178_CR2","doi-asserted-by":"publisher","first-page":"4171","DOI":"10.18653\/v1\/N19-1423","volume-title":"Proc. the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"J Devlin","year":"2018","unstructured":"Devlin J, Chang M W, Lee K, Toutanova K. BERT: Pre-training of deep bidirectional transformers for language understanding. In Proc. the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Jun. 2018, pp.4171\u20134186. DOI: https:\/\/doi.org\/10.18653\/v1\/N19-1423."},{"key":"4178_CR3","unstructured":"Radford A, Wu J, Child R, Luan D, Amodei D, Sutskever I. 
Language models are unsupervised multitask learners. OpenAI Blog, 2019, 1(8): Article No. 9."},{"key":"4178_CR4","volume-title":"Proc. the 34th International Conference on Neural Information Processing Systems","author":"T B Brown","year":"2020","unstructured":"Brown T B, Mann B, Ryder N et al. Language models are few-shot learners. In Proc. the 34th International Conference on Neural Information Processing Systems, Dec. 2020, Article No. 159."},{"key":"4178_CR5","unstructured":"Liu A, Feng B, Wang B et al. DeepSeek-V2: A strong, economical, and efficient mixture-of-experts language model. arXiv: 2405.04434, 2024. https:\/\/arxiv.org\/abs\/2405.04434, Jan. 2025."},{"key":"4178_CR6","unstructured":"Liu A, Feng B, Xue B et al. DeepSeek-V3 technical report. arXiv: 2412.19437, 2024. https:\/\/arxiv.org\/abs\/2412.19437, Jan. 2025."},{"key":"4178_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/C2021-0-02950-3","volume-title":"AI Computing Systems: An Application-Driven Perspective","author":"Y Chen","year":"2023","unstructured":"Chen Y, Li L, Li W, Guo Q, Du Z, Xu Z. AI Computing Systems: An Application-Driven Perspective. Elsevier, 2023. DOI: https:\/\/doi.org\/10.1016\/C2021-0-02950-3."},{"key":"4178_CR8","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1145\/2541940.2541967","volume-title":"Proc. the 19th International Conference on Architectural Support for Programming Languages and Operating Systems","author":"T Chen","year":"2014","unstructured":"Chen T, Du Z, Sun N, Wang J, Wu C, Chen Y, Temam O. DianNao: A small-footprint high-throughput accelerator for ubiquitous machine-learning. In Proc. the 19th International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2014, pp.269\u2013284. DOI: https:\/\/doi.org\/10.1145\/2541940.2541967."},{"key":"4178_CR9","doi-asserted-by":"publisher","first-page":"609","DOI":"10.1109\/MICRO.2014.58","volume-title":"Proc. the 47th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Y Chen","year":"2014","unstructured":"Chen Y, Luo T, Liu S, Zhang S, He L, Wang J, Li L, Chen T, Xu Z, Sun N, Temam O. DaDianNao: A machine-learning supercomputer. In Proc. the 47th Annual IEEE\/ACM International Symposium on Microarchitecture, Dec. 2014, pp.609\u2013622. DOI: https:\/\/doi.org\/10.1109\/MICRO.2014.58."},{"key":"4178_CR10","doi-asserted-by":"publisher","first-page":"369","DOI":"10.1145\/2694344.2694358","volume-title":"Proc. the 20th International Conference on Architectural Support for Programming Languages and Operating Systems","author":"D Liu","year":"2015","unstructured":"Liu D, Chen T, Liu S, Zhou J, Zhou S, Temam O, Feng X, Zhou X, Chen Y. PuDianNao: A polyvalent machine learning accelerator. In Proc. the 20th International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2015, pp.369\u2013381. DOI: https:\/\/doi.org\/10.1145\/2694344.2694358."},{"key":"4178_CR11","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1145\/2749469.2750389","volume-title":"Proc. the 42nd ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA)","author":"Z Du","year":"2015","unstructured":"Du Z, Fasthuber R, Chen T, Ienne P, Li L, Luo T, Feng X, Chen Y, Temam O. ShiDianNao: Shifting vision processing closer to the sensor. In Proc. the 42nd ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA), Jun. 2015, pp.92\u2013104. 
DOI: https:\/\/doi.org\/10.1145\/2749469.2750389."},{"key":"4178_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3079856.3080246","volume-title":"Proc. the 44th ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA)","author":"N P Jouppi","year":"2017","unstructured":"Jouppi N P, Young C, Patil N et al. In-datacenter performance analysis of a tensor processing unit. In Proc. the 44th ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA), Jun. 2017, pp.1\u201312. DOI: https:\/\/doi.org\/10.1145\/3079856.3080246."},{"key":"4178_CR13","doi-asserted-by":"publisher","first-page":"3505","DOI":"10.1145\/3394486.3406703","volume-title":"Proc. the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","author":"J Rasley","year":"2020","unstructured":"Rasley J, Rajbhandari S, Ruwase O, He Y. DeepSpeed: System optimizations enable training deep learning models with over 100 billion parameters. In Proc. the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, Jul. 2020, pp.3505\u20133506. DOI: https:\/\/doi.org\/10.1145\/3394486.3406703."},{"key":"4178_CR14","doi-asserted-by":"publisher","first-page":"18490","DOI":"10.1609\/aaai.v38i16.29810","volume-title":"Proc. the 38th AAAI Conference on Artificial Intelligence","author":"C Li","year":"2024","unstructured":"Li C, Yao Z, Wu X, Zhang M, Holmes C, Li C, He Y. DeepSpeed data efficiency: Improving deep learning model quality and training efficiency via efficient data sampling and routing. In Proc. the 38th AAAI Conference on Artificial Intelligence, Feb. 2024, pp.18490\u201318498. DOI: https:\/\/doi.org\/10.1609\/aaai.v38i16.29810."},{"key":"4178_CR15","volume-title":"Proc. the 2022 International Conference on High Performance Computing, Networking, Storage and Analysis","author":"R Y Aminabadi","year":"2022","unstructured":"Aminabadi R Y, Rajbhandari S, Awan A A, Li C, Li D, Zheng E, Ruwase O, Smith S, Zhang M, Rasley J, He Y. DeepSpeed-Inference: Enabling efficient inference of transformer models at unprecedented scale. In Proc. the 2022 International Conference on High Performance Computing, Networking, Storage and Analysis, Nov. 2022, Article No. 46."},{"key":"4178_CR16","first-page":"18332","volume-title":"Proc. the 39th International Conference on Machine Learning","author":"S Rajbhandari","year":"2022","unstructured":"Rajbhandari S, Li C, Yao Z, Zhang M, Aminabadi R Y, Awan A A, Rasley J, He Y. DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale. In Proc. the 39th International Conference on Machine Learning, Jul. 2022, pp.18332\u201318346."},{"key":"4178_CR17","unstructured":"Shoeybi M, Patwary M, Puri R, LeGresley P, Casper J, Catanzaro B. Megatron-LM: Training multi-billion parameter language models using model parallelism. arXiv: 1909.08053, 2019. https:\/\/arxiv.org\/abs\/1909.08053, Nov. 2024."},{"key":"4178_CR18","volume-title":"Proc. the 2021 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"D Narayanan","year":"2021","unstructured":"Narayanan D, Shoeybi M, Casper J et al. Efficient large-scale language model training on GPU clusters using Megatron-LM. In Proc. the 2021 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 
2021."},{"key":"4178_CR19","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1016\/j.aiopen.2022.10.001","volume":"3","author":"T Lin","year":"2022","unstructured":"Lin T, Wang Y, Liu X, Qiu X. A survey of transformers. AI Open, 2022, 3: 111\u2013132. DOI: https:\/\/doi.org\/10.1016\/j.aiopen.2022.10.001.","journal-title":"AI Open"},{"key":"4178_CR20","unstructured":"Zhao W X, Zhou K, Li J, Tang T, Wang X, Hou Y, Min Y, Zhang B, Zhang J, Dong Z, Du Y, Yang C, Chen Y, Chen Z, Jiang J, Ren R, Li Y, Tang X, Liu Z, Liu P, Nie J Y, Wen J R. A survey of large language models. arXiv: 2303.18223, 2023. https:\/\/arxiv.org\/abs\/2303.18223, Nov. 2024."},{"key":"4178_CR21","doi-asserted-by":"publisher","first-page":"2247","DOI":"10.1109\/BigData59044.2023.10386743","volume-title":"Proc. the 2023 IEEE International Conference on Big Data (BigData)","author":"J Wu","year":"2023","unstructured":"Wu J, Gan W, Chen Z, Wan S, Philip S Y. Multimodal large language models: A survey. In Proc. the 2023 IEEE International Conference on Big Data (BigData), Dec. 2023, pp.2247\u20132256. DOI: https:\/\/doi.org\/10.1109\/BigData59044.2023.10386743."},{"key":"4178_CR22","unstructured":"Kim S, Hooper C, Wattanawong T, Kang M, Yan R, Genc H, Dinh G, Huang Q, Keutzer K, Mahoney M W, Shao Y S, Gholami A. Full stack optimization of transformer inference: A survey. arXiv: 2302.14017, 2023. https:\/\/arxiv.org\/abs\/2302.14017, Nov. 2024."},{"key":"4178_CR23","unstructured":"Miao X, Oliaro G, Zhang Z, Cheng X, Jin H, Chen T, Jia Z. Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv: 2312.15234, 2023. https:\/\/arxiv.org\/abs\/2312.15234, Nov. 2024."},{"key":"4178_CR24","unstructured":"Shen L, Sun Y, Yu Z, Ding L, Tian X, Tao D. On efficient training of large-scale deep learning models: A literature review. arXiv: 2304.03589, 2023. https:\/\/arxiv.org\/abs\/2304.03589, Nov. 2024."},{"key":"4178_CR25","doi-asserted-by":"publisher","first-page":"6823","DOI":"10.24963\/ijcai.2023\/764","volume-title":"Proc. the 32nd International Joint Conference on Artificial Intelligence","author":"B Zhuang","year":"2023","unstructured":"Zhuang B, Liu J, Pan Z, He H, Weng Y, Shen C. A survey on efficient training of transformers. In Proc. the 32nd International Joint Conference on Artificial Intelligence, Aug. 2023, pp.6823\u20136831. DOI: https:\/\/doi.org\/10.24963\/ijcai.2023\/764."},{"issue":"1","key":"4178_CR26","first-page":"140","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel C, Shazeer N, Roberts A, Lee K, Narang S, Matena M, Zhou Y, Li W, Liu P J. Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research, 2020, 21(1): 140.","journal-title":"The Journal of Machine Learning Research"},{"key":"4178_CR27","unstructured":"OpenAI. GPT-4 technical report. arXiv: 2303.08774, 2023. https:\/\/arxiv.org\/abs\/2303.08774, Nov. 2024."},{"key":"4178_CR28","unstructured":"Sun Y, Wang S, Feng S, Ding S, Pang C, Shang J, Liu J, Chen X, Zhao Y, Lu Y, Liu W, Wu Z, Gong W, Liang J, Shang Z, Sun P, Liu W, Ouyang X, Yu D, Tian H, Wu H, Wang H. Ernie 3.0: Large-scale knowledge enhanced pre-training for language understanding and generation. arXiv: 2107.02137, 2021. https:\/\/arxiv.org\/abs\/2107.02137, Nov. 2024."},{"key":"4178_CR29","unstructured":"Rae J W, Borgeaud S, Cai T et al. Scaling language models: Methods, analysis & insights from training gopher. arXiv: 2112.11446, 2021. 
https:\/\/arxiv.org\/abs\/2112.11446, Nov. 2024."},{"issue":"1","key":"4178_CR30","first-page":"240","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery A, Narang S, Devlin J et al. PaLM: Scaling language modeling with pathways. The Journal of Machine Learning Research, 2023, 24(1): 240.","journal-title":"The Journal of Machine Learning Research"},{"key":"4178_CR31","unstructured":"Zhang S, Roller S, Goyal N et al. OPT: Open pre-trained transformer language models. arXiv: 2205.01068, 2022. https:\/\/arxiv.org\/abs\/2205.01068, Nov. 2024."},{"key":"4178_CR32","unstructured":"Scao T L, Fan A, Akiki C et al. BLOOM: A 176B-parameter open-access multilingual language model. arXiv: 2211.05100, 2022. https:\/\/arxiv.org\/abs\/2211.05100, Nov. 2024."},{"key":"4178_CR33","doi-asserted-by":"publisher","first-page":"320","DOI":"10.18653\/v1\/2022.acl-long.26","volume-title":"Proc. the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Z Du","year":"2022","unstructured":"Du Z, Qian Y, Liu X, Ding M, Qiu J, Yang Z, Tang J. GLM: General language model pretraining with autoregressive blank infilling. In Proc. the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), May 2022, pp.320\u2013335. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.26."},{"key":"4178_CR34","unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux M A, Lacroix T, Rozi\u00e8re B, Goyal N, Hambro E, Azhar F, Rodriguez A, Joulin A, Grave E, Lample G. LLaMA: Open and efficient foundation language models. arXiv: 2302.13971, 2023. https:\/\/arxiv.org\/abs\/2302.13971, Nov. 2024."},{"key":"4178_CR35","unstructured":"Ren X, Zhou P, Meng X, Huang X, Wang Y, Wang W, Li P, Zhang X, Podolskiy A, Arshinov G, Bout A, Piontkovskaya I, Wei J, Jiang X, Su T, Liu Q, Yao J. PanGu-\u03a3: Towards trillion parameter language model with sparse heterogeneous computing. arXiv: 2303.10845, 2023. https:\/\/arxiv.org\/abs\/2303.10845, Nov. 2024."},{"key":"4178_CR36","unstructured":"Touvron H, Martin L, Stone K et al. Llama 2: Open foundation and fine-tuned chat models. arXiv: 2307.09288, 2023. https:\/\/arxiv.org\/abs\/2307.09288, Nov. 2024."},{"key":"4178_CR37","doi-asserted-by":"publisher","first-page":"26753","DOI":"10.1109\/CVPR52733.2024.02527","volume-title":"Proc. the 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Li","year":"2024","unstructured":"Li Z, Yang B, Liu Q, Ma Z, Zhang S, Yang J, Sun Y, Liu Y, Bai X. Monkey: Image resolution and text label are important things for large multi-modal models. In Proc. the 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2024, pp.26753\u201326763. DOI: https:\/\/doi.org\/10.1109\/CVPR52733.2024.02527."},{"key":"4178_CR38","unstructured":"Anil R, Borgeaud S, Alayrac J B et al. Gemini: A family of highly capable multimodal models. arXiv: 2312.11805, 2023. https:\/\/arxiv.org\/abs\/2312.11805, Nov. 2024."},{"key":"4178_CR39","unstructured":"Bai J, Bai S, Chu Y et al. Qwen technical report. arXiv: 2309.16609, 2023. https:\/\/arxiv.org\/abs\/2309.16609, Nov. 2024."},{"key":"4178_CR40","unstructured":"Kaplan J, McCandlish S, Henighan T, Brown T B, Chess B, Child R, Gray S, Radford A, Wu J, Amodei D. Scaling laws for neural language models. arXiv: 2001.08361, 2020. https:\/\/arxiv.org\/abs\/2001.08361, Nov. 2024."},{"key":"4178_CR41","first-page":"5547","volume-title":"Proc. 
the 39th International Conference on Machine Learning","author":"N Du","year":"2022","unstructured":"Du N, Huang Y, Dai A M et al. GLaM: Efficient scaling of language models with mixture-of-experts. In Proc. the 39th International Conference on Machine Learning, Jul. 2022, pp.5547\u20135569."},{"issue":"1","key":"4178_CR42","first-page":"120","volume":"23","author":"W Fedus","year":"2022","unstructured":"Fedus W, Zoph B, Shazeer N. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. The Journal of Machine Learning Research, 2022, 23(1): 120.","journal-title":"The Journal of Machine Learning Research"},{"key":"4178_CR43","unstructured":"Dai D, Deng C, Zhao C et al. DeepSeekMoE: Towards ultimate expert specialization in mixture-of-experts language models. arXiv: 2401.06066, 2024. https:\/\/arxiv.org\/abs\/2401.06066, Jan. 2025."},{"key":"4178_CR44","unstructured":"Child R, Gray S, Radford A, Sutskever I. Generating long sequences with sparse transformers. arXiv: 1904.10509, 2019. https:\/\/arxiv.org\/abs\/1904.10509, Nov. 2024."},{"key":"4178_CR45","volume-title":"Proc. the 8th International Conference on Learning Representations","author":"N Kitaev","year":"2020","unstructured":"Kitaev N, Kaiser L, Levskaya A. Reformer: The efficient transformer. In Proc. the 8th International Conference on Learning Representations, Apr. 2020."},{"key":"4178_CR46","unstructured":"Beltagy I, Peters M E, Cohan A. Longformer: The long-document transformer. arXiv: 2004.05150, 2020. https:\/\/arxiv.org\/abs\/2004.05150, Nov. 2024."},{"key":"4178_CR47","volume-title":"Proc. the 34th International Conference on Neural Information Processing Systems","author":"M Zaheer","year":"2020","unstructured":"Zaheer M, Guruganesh G, Dubey A, Ainslie J, Alberti C, Ontanon S, Pham P, Ravula A, Wang Q, Yang L, Ahmed A. Big Bird: Transformers for longer sequences. In Proc. the 34th International Conference on Neural Information Processing Systems, Dec. 2020, Article No. 1450."},{"key":"4178_CR48","volume-title":"Proc. the 12th International Conference on Learning Representations","author":"G Xiao","year":"2024","unstructured":"Xiao G, Tian Y, Chen B, Han S, Lewis M. Efficient streaming language models with attention sinks. In Proc. the 12th International Conference on Learning Representations, May 2024."},{"key":"4178_CR49","doi-asserted-by":"publisher","first-page":"6383","DOI":"10.18653\/v1\/2023.acl-long.352","volume-title":"Proc. the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"N Ratner","year":"2023","unstructured":"Ratner N, Levine Y, Belinkov Y et al. Parallel context windows for large language models. In Proc. the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Jul. 2023, pp.6383\u20136402. DOI: https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.352."},{"key":"4178_CR50","unstructured":"Ding J, Ma S, Dong L, Zhang X, Huang S, Wang W, Zheng N, Wei F. LongNet: Scaling transformers to 1,000,000,000 tokens. arXiv: 2307.02486, 2023. https:\/\/arxiv.org\/abs\/2307.02486, Nov. 2024."},{"issue":"6","key":"4178_CR51","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1145\/3530811","volume":"55","author":"Y Tay","year":"2023","unstructured":"Tay Y, Dehghani M, Bahri D, Metzler D. Efficient transformers: A survey. ACM Computing Surveys, 2023, 55(6): 109. 
DOI: https:\/\/doi.org\/10.1145\/3530811.","journal-title":"ACM Computing Surveys"},{"key":"4178_CR52","doi-asserted-by":"publisher","first-page":"7871","DOI":"10.18653\/v1\/2020.acl-main.703","volume-title":"Proc. the 58th Annual Meeting of the Association for Computational Linguistics","author":"M Lewis","year":"2020","unstructured":"Lewis M, Liu Y, Goyal N, Ghazvininejad M, Mohamed A, Levy O, Stoyanov V, Zettlemoyer L. BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In Proc. the 58th Annual Meeting of the Association for Computational Linguistics, Jul. 2020, pp.7871\u20137880. DOI: https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.703."},{"key":"4178_CR53","doi-asserted-by":"publisher","first-page":"483","DOI":"10.18653\/v1\/2021.naacl-main.41","volume-title":"Proc. the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"L Xue","year":"2021","unstructured":"Xue L, Constant N, Roberts A, Kale M, Al-Rfou R, Siddhant A, Barua A, Raffel C. mT5: A massively multilingual pre-trained text-to-text transformer. In Proc. the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Jun. 2021, pp.483\u2013498. DOI: https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.41."},{"key":"4178_CR54","volume-title":"Proc. the Eleventh International Conference on Learning Representations","author":"Y Tay","year":"2023","unstructured":"Tay Y, Dehghani M, Tran V Q, Garcia X, Wei J, Wang X, Chung H W, Bahri D, Schuster T, Zheng S, Zhou D, Houlsby N, Metzler D. UL2: Unifying language learning paradigms. In Proc. the Eleventh International Conference on Learning Representations, May 2023."},{"key":"4178_CR55","doi-asserted-by":"publisher","first-page":"1069","DOI":"10.18653\/v1\/2023.emnlp-main.68","volume-title":"Proc. the 2023 Conference on Empirical Methods in Natural Language Processing","author":"Y Wang","year":"2023","unstructured":"Wang Y, Le H, Gotmare A, Bui N D, Li J, Hoi S C. CodeT5+: Open code large language models for code understanding and generation. In Proc. the 2023 Conference on Empirical Methods in Natural Language Processing, Dec. 2023, pp.1069\u20131088. DOI: https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.68."},{"key":"4178_CR56","unstructured":"Soltan S, Ananthakrishnan S, FitzGerald J, Gupta R, Hamza W, Khan H, Peris C, Rawls S, Rosenbaum A, Rumshisky A, Prakash C S, Sridhar M, Triefenbach F, Verma A, Tur G, Natarajan P. AlexaTM 20B: Few-shot learning using a large-scale multilingual seq2seq model. arXiv: 2208.01448, 2022. https:\/\/arxiv.org\/abs\/2208.01448, Nov. 2024."},{"key":"4178_CR57","volume-title":"Proc. the 10th International Conference on Learning Representations","author":"E J Hu","year":"2022","unstructured":"Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W. LoRA: Low-rank adaptation of large language models. In Proc. the 10th International Conference on Learning Representations, Apr. 2022."},{"key":"4178_CR58","doi-asserted-by":"publisher","first-page":"4582","DOI":"10.18653\/v1\/2021.acl-long.353","volume-title":"Proc. the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing","author":"X L Li","year":"2021","unstructured":"Li X L, Liang P. Prefix-tuning: Optimizing continuous prompts for generation. In Proc. 
the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, Aug. 2021, pp.4582\u20134597. DOI: https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.353."},{"key":"4178_CR59","doi-asserted-by":"publisher","first-page":"5254","DOI":"10.18653\/v1\/2023.emnlp-main.319","volume-title":"Proc. the 2023 Conference on Empirical Methods in Natural Language Processing","author":"Z Hu","year":"2023","unstructured":"Hu Z, Wang L, Lan Y, Xu W, Lim E P, Bing L, Xu X, Poria S, Lee R. LLM-Adapters: An adapter family for parameter-efficient fine-tuning of large language models. In Proc. the 2023 Conference on Empirical Methods in Natural Language Processing, Dec. 2023, pp.5254\u20135276. DOI: https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.319."},{"key":"4178_CR60","volume-title":"Proc. the 10th International Conference on Learning Representations","author":"J Wei","year":"2022","unstructured":"Wei J, Bosma M, Zhao V Y, Guu K, Yu A W, Lester B, Du N, Dai A M, Le Q V. Finetuned language models are zero-shot learners. In Proc. the 10th International Conference on Learning Representations, Apr. 2022."},{"key":"4178_CR61","volume-title":"Proc. the 36th International Conference on Neural Information Processing Systems","author":"J Wei","year":"2022","unstructured":"Wei J, Wang X, Schuurmans D, Bosma M, Ichter B, Xia F, Chi E H, Le Q V, Zhou D. Chain-of-thought prompting elicits reasoning in large language models. In Proc. the 36th International Conference on Neural Information Processing Systems, Nov. 2022, Article No. 1800."},{"key":"4178_CR62","volume-title":"Proc. the 36th International Conference on Neural Information Processing Systems","author":"L Ouyang","year":"2022","unstructured":"Ouyang L, Wu J, Jiang X et al. Training language models to follow instructions with human feedback. In Proc. the 36th International Conference on Neural Information Processing Systems, Nov. 2022, Article No. 2011."},{"key":"4178_CR63","unstructured":"Yu T, Zhu H. Hyper-parameter optimization: A review of algorithms and applications. arXiv: 2003.05689, 2020. https:\/\/arxiv.org\/abs\/2003.05689, Nov. 2024."},{"issue":"2","key":"4178_CR64","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1162\/neco.1989.1.2.270","volume":"1","author":"R J Williams","year":"1989","unstructured":"Williams R J, Zipser D. A learning algorithm for continually running fully recurrent neural networks. Neural Computation, 1989, 1(2): 270\u2013280. DOI: https:\/\/doi.org\/10.1162\/neco.1989.1.2.270.","journal-title":"Neural Computation"},{"key":"4178_CR65","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1109\/HPCA51647.2021.00018","volume-title":"Proc. the 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"H Wang","year":"2021","unstructured":"Wang H, Zhang Z, Han S. SpAtten: Efficient sparse attention architecture with cascade token and head pruning. In Proc. the 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA), Feb. 27\u2013Mar. 3, 2021, pp.97\u2013110. DOI: https:\/\/doi.org\/10.1109\/HPCA51647.2021.00018."},{"key":"4178_CR66","doi-asserted-by":"publisher","first-page":"5052","DOI":"10.1109\/ICCV51070.2023.00466","volume-title":"Proc. the 2023 IEEE\/CVF International Conference on Computer Vision","author":"W Zeng","year":"2023","unstructured":"Zeng W, Li M, Xiong W, Tong T, Lu W J, Tan J, Wang R, Huang R. 
MPCViT: Searching for accurate and efficient MPC-friendly vision transformer with heterogeneous attention. In Proc. the 2023 IEEE\/CVF International Conference on Computer Vision, Oct. 2023, pp.5052\u20135063. DOI: https:\/\/doi.org\/10.1109\/ICCV51070.2023.00466."},{"key":"4178_CR67","unstructured":"Liu L, Qu Z, Chen Z, Ding Y, Xie Y. Transformer acceleration with dynamic sparse attention. arXiv: 2110.11299, 2021. https:\/\/arxiv.org\/abs\/2110.11299, Nov. 2024."},{"key":"4178_CR68","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1109\/DAC18074.2021.9586134","volume-title":"Proc. the 58th ACM\/IEEE Design Automation Conference (DAC)","author":"J R Stevens","year":"2021","unstructured":"Stevens J R, Venkatesan R, Dai S, Khailany B, Raghunathan A. Softermax: Hardware\/software co-design of an efficient softmax for transformers. In Proc. the 58th ACM\/IEEE Design Automation Conference (DAC), Dec. 2021, pp.469\u2013474. DOI: https:\/\/doi.org\/10.1109\/DAC18074.2021.9586134."},{"key":"4178_CR69","doi-asserted-by":"publisher","first-page":"393","DOI":"10.1109\/ISCA.2016.42","volume-title":"Proc. the 43rd ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA)","author":"S Liu","year":"2016","unstructured":"Liu S, Du Z, Tao J, Han D, Luo T, Xie Y, Chen Y, Chen T. Cambricon: An instruction set architecture for neural networks. In Proc. the 43rd ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA), Jun. 2016, pp.393\u2013405. DOI: https:\/\/doi.org\/10.1109\/ISCA.2016.42."},{"key":"4178_CR70","first-page":"15","volume-title":"Proc. the 20th USENIX Symposium on Networked Systems Design and Implementation","author":"K Liu","year":"2023","unstructured":"Liu K, Jiang Z, Zhang J, Wei H, Zhong X, Tan L, Pan T, Huang T. Hostping: Diagnosing intra-host network bottlenecks in RDMA servers. In Proc. the 20th USENIX Symposium on Networked Systems Design and Implementation, Apr. 2023, pp.15\u201329."},{"issue":"1","key":"4178_CR71","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1109\/TPDS.2019.2928289","volume":"31","author":"A Li","year":"2020","unstructured":"Li A, Song S L, Chen J, Li J, Liu X, Tallent N R, Barker K J. Evaluating modern GPU interconnect: PCIe, NVLink, NV-SLI, NVSwitch and GPUDirect. IEEE Trans. Parallel and Distributed Systems, 2020, 31(1): 94\u2013110. DOI: https:\/\/doi.org\/10.1109\/TPDS.2019.2928289.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4178_CR72","doi-asserted-by":"publisher","first-page":"661","DOI":"10.1109\/SC.2018.00055","volume-title":"Proc. the 2018 International Conference for High Performance Computing, Networking, Storage and Analysis (SC)","author":"S S Vazhkudai","year":"2018","unstructured":"Vazhkudai S S, De Supinski B R, Bland A S et al. The design, deployment, and evaluation of the CORAL pre-exascale systems. In Proc. the 2018 International Conference for High Performance Computing, Networking, Storage and Analysis (SC), Nov. 2018, pp.661\u2013672. DOI: https:\/\/doi.org\/10.1109\/SC.2018.00055."},{"key":"4178_CR73","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1109\/HOTI55740.2022.00017","volume-title":"Proc. the 2022 IEEE Symposium on High-Performance Interconnects (HOTI)","author":"D D Sharma","year":"2022","unstructured":"Sharma D D. Compute express link\u00ae: An open industry-standard interconnect enabling heterogeneous data-centric computing. In Proc. the 2022 IEEE Symposium on High-Performance Interconnects (HOTI), Aug. 2022, pp.5\u201312. 
DOI: https:\/\/doi.org\/10.1109\/HOTI55740.2022.00017."},{"key":"4178_CR74","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1109\/HOTI59126.2023.00022","volume-title":"Proc. the 2023 IEEE Symposium on High-Performance Interconnects (HOTI)","author":"H Qi","year":"2023","unstructured":"Qi H, Dai L, Chen W, Jia Z, Lu X. Performance characterization of large language models on high-speed interconnects. In Proc. the 2023 IEEE Symposium on High-Performance Interconnects (HOTI), Aug. 2023, pp.53\u201360. DOI: https:\/\/doi.org\/10.1109\/HOTI59126.2023.00022."},{"key":"4178_CR75","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350","volume-title":"Proc. the 50th Annual International Symposium on Computer Architecture","author":"N Jouppi","year":"2023","unstructured":"Jouppi N, Kurian G, Li S, Ma P, Nagarajan R, Nai L, Patil N, Subramanian S, Swing A, Towles B, Young C, Zhou X, Zhou Z, Patterson D A. TPU v4: An optically reconfigurable supercomputer for machine learning with hardware support for embeddings. In Proc. the 50th Annual International Symposium on Computer Architecture, Jun. 2023, Article No. 82. DOI: https:\/\/doi.org\/10.1145\/3579371.3589350."},{"key":"4178_CR76","first-page":"745","volume-title":"Proc. the 21st USENIX Symposium on Networked Systems Design and Implementation","author":"Z Jiang","year":"2024","unstructured":"Jiang Z, Lin H, Zhong Y et al. MegaScale: Scaling large language model training to more than 10,000 GPUs. In Proc. the 21st USENIX Symposium on Networked Systems Design and Implementation, Apr. 2024, pp.745\u2013760."},{"key":"4178_CR77","unstructured":"Naumov M, Kim J, Mudigere D et al. Deep learning training in Facebook data centers: Design of scale-up and scale-out systems. arXiv: 2003.09518, 2020. https:\/\/arxiv.org\/abs\/2003.09518, Nov. 2024."},{"key":"4178_CR78","doi-asserted-by":"publisher","first-page":"2002","DOI":"10.1145\/3292500.3330756","volume-title":"Proc. the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","author":"S Tokui","year":"2019","unstructured":"Tokui S, Okuta R, Akiba T, Niitani Y, Ogawa T, Saito S, Suzuki S, Uenishi K, Vogel B, Yamazaki Vincent H. Chainer: A deep learning framework for accelerating the research cycle. In Proc. the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, Aug. 2019, pp.2002\u20132011. DOI: https:\/\/doi.org\/10.1145\/3292500.3330756."},{"key":"4178_CR79","first-page":"10435","volume-title":"Proc. the 32nd International Conference on Neural Information Processing Systems","author":"N Shazeer","year":"2018","unstructured":"Shazeer N, Cheng Y, Parmar N et al. Mesh-TensorFlow: Deep learning for supercomputers. In Proc. the 32nd International Conference on Neural Information Processing Systems, Dec. 2018, pp.10435\u201310444."},{"key":"4178_CR80","unstructured":"Kim C, Lee H, Jeong M, Baek W, Yoon B, Kim I, Lim S, Kim S. torchgpipe: On-the-fly pipeline parallelism for training giant models. arXiv: 2004.09910, 2020. https:\/\/arxiv.org\/abs\/2004.09910, Nov. 2024."},{"key":"4178_CR81","volume-title":"Proc. the 2020 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"S Rajbhandari","year":"2020","unstructured":"Rajbhandari S, Rasley J, Ruwase O, He Y. ZeRO: Memory optimizations toward training trillion parameter models. In Proc. the 2020 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2020, Article No. 
20."},{"key":"4178_CR82","unstructured":"Yuan J, Li X, Cheng C, Liu J, Guo R, Cai S, Yao C, Yang F, Yi X, Wu C, Zhang H, Zhao J. OneFlow: Redesign the distributed deep learning framework from scratch. arXiv: 2110.15032, 2021. https:\/\/arxiv.org\/abs\/2110.15032, Nov. 2024."},{"key":"4178_CR83","first-page":"673","volume-title":"Proc. the 2022 USENIX Annual Technical Conference","author":"X Jia","year":"2022","unstructured":"Jia X, Jiang L, Wang A, Xiao W, Shi Z, Zhang J, Li X, Chen L, Li Y, Zheng Z, Liu X, Lin W. Whale: Efficient giant model training over heterogeneous GPUs. In Proc. the 2022 USENIX Annual Technical Conference, Jul. 2022, pp.673\u2013688."},{"key":"4178_CR84","doi-asserted-by":"publisher","first-page":"606","DOI":"10.1109\/IPDPS53621.2022.00065","volume-title":"Proc. the 2022 IEEE International Parallel and Distributed Processing Symposium","author":"S Singh","year":"2022","unstructured":"Singh S, Bhatele A. AxoNN: An asynchronous, message-driven parallel framework for extreme-scale deep learning. In Proc. the 2022 IEEE International Parallel and Distributed Processing Symposium, May 30\u2013Jun. 3, 2022, pp.606\u2013616. DOI: https:\/\/doi.org\/10.1109\/IPDPS53621.2022.00065."},{"key":"4178_CR85","doi-asserted-by":"publisher","first-page":"472","DOI":"10.1145\/3492321.3519584","volume-title":"Proc. the 17th European Conference on Computer Systems","author":"S Athlur","year":"2022","unstructured":"Athlur S, Saran N, Sivathanu M, Ramjee R, Kwatra N. Varuna: Scalable, low-cost training of massive deep learning models. In Proc. the 17th European Conference on Computer Systems, Apr. 2022, pp.472\u2013487. DOI: https:\/\/doi.org\/10.1145\/3492321.3519584."},{"key":"4178_CR86","doi-asserted-by":"publisher","first-page":"766","DOI":"10.1145\/3605573.3605613","volume-title":"Proc. the 52nd International Conference on Parallel Processing","author":"S Li","year":"2023","unstructured":"Li S, Liu H, Bian Z, Fang J, Huang H, Liu Y, Wang B, You Y. Colossal-AI: A unified deep learning system for large-scale parallel training. In Proc. the 52nd International Conference on Parallel Processing, Aug. 2023, pp.766\u2013775. DOI: https:\/\/doi.org\/10.1145\/3605573.3605613."},{"issue":"5","key":"4178_CR87","doi-asserted-by":"publisher","first-page":"1466","DOI":"10.1109\/TPDS.2023.3247001","volume":"34","author":"Z Lai","year":"2023","unstructured":"Lai Z, Li S, Tang X, Ge K, Liu W, Duan Y, Qiao L, Li D. Merak: An efficient distributed DNN training framework with automated 3D parallelism for giant foundation models. IEEE Trans. Parallel and Distributed Systems, 2023, 34(5): 1466\u20131478. DOI: https:\/\/doi.org\/10.1109\/TPDS.2023.3247001.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4178_CR88","first-page":"497","volume-title":"Proc. the 20th USENIX Symposium on Networked Systems Design and Implementation","author":"J Thorpe","year":"2023","unstructured":"Thorpe J, Zhao P, Eyolfson J, Qiao Y, Jia Z, Zhang M, Netravali R, Xu G H. Bamboo: Making preemptible instances resilient for affordable training of large DNNs. In Proc. the 20th USENIX Symposium on Networked Systems Design and Implementation, Apr. 2023, pp.497\u2013513."},{"key":"4178_CR89","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1145\/3600006.3613152","volume-title":"Proc. the 29th Symposium on Operating Systems Principles","author":"I Jang","year":"2023","unstructured":"Jang I, Yang Z, Zhang Z, Jin X, Chowdhury M. 
Oobleck: Resilient distributed training of large models using pipeline templates. In Proc. the 29th Symposium on Operating Systems Principles, Oct. 2023, pp.382\u2013395. DOI: https:\/\/doi.org\/10.1145\/3600006.3613152."},{"key":"4178_CR90","unstructured":"Lepikhin D, Lee H, Xu Y et al. GShard: Scaling giant models with conditional computation and automatic sharding. arXiv: 2006.16668, 2020. https:\/\/arxiv.org\/abs\/2006.16668, Jan. 2025."},{"key":"4178_CR91","unstructured":"Li S, Xue F, Baranwal C, Li Y, You Y. Sequence parallelism: Long sequence training from system perspective. arXiv: 2105.13120, 2021. https:\/\/arxiv.org\/abs\/2105.13120, Jan. 2025."},{"key":"4178_CR92","unstructured":"Jia X, Song S, He W et al. Highly scalable deep learning training system with mixed-precision: Training ImageNet in four minutes. arXiv: 1807.11205, 2018. https:\/\/arxiv.org\/abs\/1807.11205, Nov. 2024."},{"key":"4178_CR93","unstructured":"Xu Y, Lee H, Chen D, Choi H, Hechtman B, Wang S. Automatic cross-replica sharding of weight update in data-parallel training. arXiv: 2004.13336, 2020. https:\/\/arxiv.org\/abs\/2004.13336, Nov. 2024."},{"key":"4178_CR94","doi-asserted-by":"publisher","first-page":"222","DOI":"10.1109\/IPDPS54959.2023.00031","volume-title":"Proc. the 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"Q Xu","year":"2023","unstructured":"Xu Q, You Y. An efficient 2D method for training super-large deep learning models. In Proc. the 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS), May 2023, pp.222\u2013232. DOI: https:\/\/doi.org\/10.1109\/IPDPS54959.2023.00031."},{"key":"4178_CR95","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545087","volume-title":"Proc. the 51st International Conference on Parallel Processing","author":"B Wang","year":"2022","unstructured":"Wang B, Xu Q, Bian Z, You Y. Tesseract: Parallelize the tensor parallelism efficiently. In Proc. the 51st International Conference on Parallel Processing, Aug. 29\u2013Sept. 1, 2022, Article No. 12. DOI: https:\/\/doi.org\/10.1145\/3545008.3545087."},{"key":"4178_CR96","unstructured":"Bian Z, Xu Q, Wang B, You Y. Maximizing parallelism in distributed training for huge neural networks. arXiv: 2105.14450, 2021. https:\/\/arxiv.org\/abs\/2105.14450, Nov. 2024."},{"key":"4178_CR97","unstructured":"Cheng S, Liu Z, Du J, You Y. ATP: Adaptive tensor parallelism for foundation models. arXiv: 2301.08658, 2023. https:\/\/arxiv.org\/abs\/2301.08658, Nov. 2024."},{"issue":"12","key":"4178_CR98","doi-asserted-by":"publisher","first-page":"4326","DOI":"10.1109\/TPDS.2022.3187815","volume":"33","author":"Z Zeng","year":"2022","unstructured":"Zeng Z, Liu C, Tang Z, Li K, Li K. AccTFM: An effective intra-layer model parallelization strategy for training large-scale transformer-based models. IEEE Trans. Parallel and Distributed Systems, 2022, 33(12): 4326\u20134338. DOI: https:\/\/doi.org\/10.1109\/TPDS.2022.3187815.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4178_CR99","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1145\/3567955.3567959","volume-title":"Proc. the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"S Wang","year":"2022","unstructured":"Wang S, Wei J, Sabne A, Davis A, Ilbeyi B, Hechtman B, Chen D, Murthy K S, Maggioni M, Zhang Q, Kumar S, Guo T, Xu Y, Zhou Z. 
Overlap communication with dependent computation via decomposition in large deep learning models. In Proc. the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2022, pp.93\u2013106. DOI: https:\/\/doi.org\/10.1145\/3567955.3567959."},{"key":"4178_CR100","volume-title":"Proc. the 33rd International Conference on Neural Information Processing Systems","author":"Y Huang","year":"2019","unstructured":"Huang Y, Cheng Y, Bapna A, Firat O, Chen M X, Chen D, Lee H, Ngiam J, Le Q V, Wu Y, Chen Z. GPipe: Efficient training of giant neural networks using pipeline parallelism. In Proc. the 33rd International Conference on Neural Information Processing Systems, Dec. 2019, Article No. 10."},{"key":"4178_CR101","doi-asserted-by":"publisher","first-page":"431","DOI":"10.1145\/3437801.3441593","volume-title":"Proc. the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","author":"S Fan","year":"2021","unstructured":"Fan S, Rong Y, Meng C, Cao Z, Wang S, Zheng Z, Wu C, Long G, Yang J, Xia L, Diao L, Liu X, Lin W. DAPPLE: A pipelined data parallel approach for training large models. In Proc. the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, Feb. 2021, pp.431\u2013445. DOI: https:\/\/doi.org\/10.1145\/3437801.3441593."},{"key":"4178_CR102","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3341301.3359646","volume-title":"Proc. the 27th ACM Symposium on Operating Systems Principles","author":"D Narayanan","year":"2019","unstructured":"Narayanan D, Harlap A, Phanishayee A, Seshadri V, Devanur N R, Ganger G R, Gibbons P B, Zaharia M. PipeDream: Generalized pipeline parallelism for DNN training. In Proc. the 27th ACM Symposium on Operating Systems Principles, Oct. 2019, pp.1\u201315. DOI: https:\/\/doi.org\/10.1145\/3341301.3359646."},{"key":"4178_CR103","first-page":"7937","volume-title":"Proc. the 38th International Conference on Machine Learning","author":"D Narayanan","year":"2021","unstructured":"Narayanan D, Phanishayee A, Shi K, Chen X, Zaharia M. Memory-efficient pipeline-parallel DNN training. In Proc. the 38th International Conference on Machine Learning, Jul. 2021, pp.7937\u20137947."},{"key":"4178_CR104","first-page":"307","volume-title":"Proc. the 2020 USENIX Annual Technical Conference","author":"J H Park","year":"2020","unstructured":"Park J H, Yun G, Yi C M, Nguyen N T, Lee S, Choi J, Noh S H, Choi Y R. HetPipe: Enabling large DNN training on (Whimpy) heterogeneous GPU clusters through integration of pipelined model parallelism and data parallelism. In Proc. the 2020 USENIX Annual Technical Conference, Jul. 2020, pp.307\u2013321."},{"key":"4178_CR105","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3458817.3476145","volume-title":"Proc. the 2021 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"S Li","year":"2021","unstructured":"Li S, Hoefler T. Chimera: Efficiently training large-scale neural networks with bidirectional pipelines. In Proc. the 2021 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2021, pp.1\u201314. DOI: https:\/\/doi.org\/10.1145\/3458817.3476145."},{"key":"4178_CR106","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/SC41405.2020.00049","volume-title":"Proc. 
the 2020 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"A Jain","year":"2020","unstructured":"Jain A, Awan A A, Aljuhani A M, Hashmi J M, Anthony Q G, Subramoni H, Panda D K, Machiraju R, Parwani A. GEMS: GPU-enabled memory-aware model-parallelism system for distributed DNN training. In Proc. the 2020 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2020, pp.1\u201315. DOI: https:\/\/doi.org\/10.1109\/SC41405.2020.00049."},{"key":"4178_CR107","first-page":"16639","volume-title":"Proc. the 40th International Conference on Machine Learning","author":"T Kim","year":"2023","unstructured":"Kim T, Kim H, Yu G I, Chun B G. BPipe: Memory-balanced pipeline parallelism for training large language models. In Proc. the 40th International Conference on Machine Learning, Jul. 2023, pp.16639\u201316653."},{"issue":"3","key":"4178_CR108","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1109\/TPDS.2021.3094364","volume":"33","author":"S Zhao","year":"2022","unstructured":"Zhao S, Li F, Chen X, Guan X, Jiang J, Huang D, Qing Y, Wang S, Wang P, Zhang G, Li C, Luo P, Cui H. vPipe: A virtualized acceleration system for achieving efficient and scalable pipeline parallel DNN training. IEEE Trans. Parallel and Distributed Systems, 2022, 33(3): 489\u2013506. DOI: https:\/\/doi.org\/10.1109\/TPDS.2021.3094364.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4178_CR109","doi-asserted-by":"publisher","first-page":"1004","DOI":"10.1109\/IPDPS49936.2021.00109","volume-title":"Proc. the 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"M Tanaka","year":"2021","unstructured":"Tanaka M, Taura K, Hanawa T, Torisawa K. Automatic graph partitioning for very large-scale deep learning. In Proc. the 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS), May 2021, pp.1004\u20131013. DOI: https:\/\/doi.org\/10.1109\/IPDPS49936.2021.00109."},{"issue":"3","key":"4178_CR110","doi-asserted-by":"publisher","first-page":"470","DOI":"10.14778\/3570690.3570697","volume":"16","author":"X Miao","year":"2022","unstructured":"Miao X, Wang Y, Jiang Y, Shi C, Nie X, Zhang H, Cui B. Galvatron: Efficient transformer training over multiple GPUs using automatic parallelism. Proc. the VLDB Endowment, 2022, 16(3): 470\u2013479. DOI: https:\/\/doi.org\/10.14778\/3570690.3570697.","journal-title":"Proc. the VLDB Endowment"},{"key":"4178_CR111","first-page":"559","volume-title":"Proc. the 16th USENIX Symposium on Operating Systems Design and Implementation","author":"L Zheng","year":"2022","unstructured":"Zheng L, Li Z, Zhang H, Zhuang Y, Chen Z, Huang Y, Wang Y, Xu Y, Zhuo D, Xing E P, Gonzalez J E, Stoica I. Alpa: Automating inter- and intra-operator parallelism for distributed deep learning. In Proc. the 16th USENIX Symposium on Operating Systems Design and Implementation, Jul. 2022, pp.559\u2013578."},{"key":"4178_CR112","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1109\/INFOCOM48880.2022.9796787","volume-title":"Proc. the 2022 IEEE Conference on Computer Communications","author":"Z Luo","year":"2022","unstructured":"Luo Z, Yi X, Long G, Fan S, Wu C, Yang J, Lin W. Efficient pipeline planning for expedited distributed DNN training. In Proc. the 2022 IEEE Conference on Computer Communications, May 2022, pp.340\u2013349. 
DOI: https:\/\/doi.org\/10.1109\/INFOCOM48880.2022.9796787."},{"key":"4178_CR113","first-page":"24829","volume-title":"Proc. the 35th International Conference on Neural Information Processing Systems","author":"J Tarnawski","year":"2021","unstructured":"Tarnawski J, Narayanan D, Phanishayee A. Piper: Multidimensional planner for DNN parallelization. In Proc. the 35th International Conference on Neural Information Processing Systems, Dec. 2021, pp.24829\u201324840."},{"key":"4178_CR114","unstructured":"Wang S, Rong Y, Fan S, Zheng Z, Diao L, Long G, Yang J, Liu X, Lin W. Auto-MAP: A DQN framework for exploring distributed execution plans for DNN workloads. arXiv: 2007.04069, 2020. https:\/\/arxiv.org\/abs\/2007.04069, Nov. 2024."},{"key":"4178_CR115","doi-asserted-by":"publisher","first-page":"642","DOI":"10.1145\/3600006.3613175","volume-title":"Proc. the 29th Symposium on Operating Systems Principles","author":"S Jayaram Subramanya","year":"2023","unstructured":"Jayaram Subramanya S, Arfeen D, Lin S, Qiao A, Jia Z, Ganger G R. Sia: Heterogeneity-aware, goodput-optimized ML-cluster scheduling. In Proc. the 29th Symposium on Operating Systems Principles, Oct. 2023, pp.642\u2013657. DOI: https:\/\/doi.org\/10.1145\/3600006.3613175."},{"key":"4178_CR116","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1145\/3503222.3507778","volume-title":"Proc. the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"A Jangda","year":"2022","unstructured":"Jangda A, Huang J, Liu G, Sabet A H N, Maleki S, Miao Y, Musuvathi M, Mytkowicz T, Saarikivi O. Breaking the computation and communication abstraction barrier in distributed machine learning workloads. In Proc. the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Feb. 28\u2013Mar. 4, 2022, pp.402\u2013416. DOI: https:\/\/doi.org\/10.1145\/3503222.3507778."},{"key":"4178_CR117","doi-asserted-by":"publisher","first-page":"178","DOI":"10.1145\/3620666.3651379","volume-title":"Proc. the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"C Chen","year":"2024","unstructured":"Chen C, Li X, Zhu Q, Duan J, Sun P, Zhang X, Yang C. Centauri: Enabling efficient scheduling for communication-computation overlap in large model training via communication partitioning. In Proc. the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Apr. 27\u2013May 1, 2024, pp.178\u2013191. DOI: https:\/\/doi.org\/10.1145\/3620666.3651379."},{"key":"4178_CR118","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/CCGrid49817.2020.00-76","volume-title":"Proc. the 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)","author":"B Nicolae","year":"2020","unstructured":"Nicolae B, Li J, Wozniak J M, Bosilca G, Dorier M, Cappello F. DeepFreeze: Towards scalable asynchronous checkpointing of deep learning models. In Proc. the 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID), May 2020, pp.172\u2013181. DOI: https:\/\/doi.org\/10.1109\/CCGrid49817.2020.00-76."},{"key":"4178_CR119","first-page":"203","volume-title":"Proc. the 19th USENIX Conference on File and Storage Technologies","author":"J Mohan","year":"2021","unstructured":"Mohan J, Phanishayee A, Chidambaram V. CheckFreq: Frequent, fine-grained DNN checkpointing. In Proc. 
the 19th USENIX Conference on File and Storage Technologies, Feb. 2021, pp.203\u2013216."},{"key":"4178_CR120","first-page":"929","volume-title":"Proc. the 19th USENIX Symposium on Networked Systems Design and Implementation","author":"A Eisenman","year":"2022","unstructured":"Eisenman A, Matam K K, Ingram S, Mudigere D, Krishnamoorthi R, Nair K, Smelyanskiy M, Annavaram M. Check-N-Run: A checkpointing system for training deep learning recommendation models. In Proc. the 19th USENIX Symposium on Networked Systems Design and Implementation, Apr. 2022, pp.929\u2013943."},{"key":"4178_CR121","doi-asserted-by":"publisher","first-page":"364","DOI":"10.1145\/3600006.3613145","volume-title":"Proc. the 29th Symposium on Operating Systems Principles","author":"Z Wang","year":"2023","unstructured":"Wang Z, Jia Z, Zheng S, Zhang Z, Fu X, Ng T S E, Wang Y. GEMINI: Fast failure recovery in distributed training with in-memory checkpoints. In Proc. the 29th Symposium on Operating Systems Principles, Oct. 2023, pp.364\u2013381. DOI: https:\/\/doi.org\/10.1145\/3600006.3613145."},{"key":"4178_CR122","unstructured":"Wu B, Xia L, Li Q, Li K, Chen X, Guo Y, Xiang T, Chen Y, Li S. TRANSOM: An efficient fault-tolerant system for training LLMs. arXiv: 2310.10046, 2023. https:\/\/arxiv.org\/abs\/2310.10046, Nov. 2024."},{"key":"4178_CR123","unstructured":"He T, Li X, Wang Z, Qian K, Xu J, Yu W, Zhou J. Unicron: Economizing self-healing LLM training at scale. arXiv: 2401.00134, 2023. https:\/\/arxiv.org\/abs\/2401.00134, Nov. 2024."},{"key":"4178_CR124","first-page":"463","volume-title":"Proc. the 14th USENIX Symposium on Operating Systems Design and Implementation","author":"Y Jiang","year":"2020","unstructured":"Jiang Y, Zhu Y, Lan C, Yi B, Cui Y, Guo C. A unified architecture for accelerating distributed DNN training in heterogeneous GPU\/CPU clusters. In Proc. the 14th USENIX Symposium on Operating Systems Design and Implementation, Nov. 2020, pp.463\u2013479."},{"key":"4178_CR125","doi-asserted-by":"publisher","first-page":"401","DOI":"10.1145\/3373376.3378499","volume-title":"Proc. the 25th International Conference on Architectural Support for Programming Languages and Operating Systems","author":"Q Luo","year":"2020","unstructured":"Luo Q, He J, Zhuo Y, Qian X. Prague: High-performance heterogeneity-aware asynchronous decentralized training. In Proc. the 25th International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2020, pp.401\u2013416. DOI: https:\/\/doi.org\/10.1145\/3373376.3378499."},{"key":"4178_CR126","doi-asserted-by":"publisher","first-page":"610","DOI":"10.1109\/HPCA47549.2020.00056","volume-title":"Proc. the 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"J Dong","year":"2020","unstructured":"Dong J, Cao Z, Zhang T et al. EFLOPS: Algorithm and system co-design for a high performance distributed training platform. In Proc. the 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA), Feb. 2020, pp.610\u2013622. DOI: https:\/\/doi.org\/10.1109\/HPCA47549.2020.00056."},{"key":"4178_CR127","doi-asserted-by":"publisher","first-page":"374","DOI":"10.1145\/3503222.3507735","volume-title":"Proc. the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"S Zhao","year":"2022","unstructured":"Zhao S, Li F, Chen X, Shen T, Chen L, Wang S, Zhang N, Li C, Cui H. 
NASPipe: High performance and reproducible pipeline parallel supernet training via causal synchronous parallelism. In Proc. the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Feb. 28\u2013Mar. 4, 2022, pp.374\u2013387. DOI: https:\/\/doi.org\/10.1145\/3503222.3507735."},{"key":"4178_CR128","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126933","volume-title":"Proc. the 2017 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"M Amaral","year":"2017","unstructured":"Amaral M, Polo J, Carrera D, Seelam S, Steinder M. Topology-aware GPU scheduling for learning workloads in cloud environments. In Proc. the 2017 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2017, Article No. 17. DOI: https:\/\/doi.org\/10.1145\/3126908.3126933."},{"key":"4178_CR129","unstructured":"Chen T, Xu B, Zhang C, Guestrin C. Training deep nets with sublinear memory cost. arXiv: 1604.06174, 2016. https:\/\/arxiv.org\/abs\/1604.06174, Nov. 2024."},{"key":"4178_CR130","volume-title":"Proc. the 33rd International Conference on Neural Information Processing Systems","author":"M Kusumoto","year":"2019","unstructured":"Kusumoto M, Inoue T, Watanabe G, Akiba T, Koyama M. A graph theoretic framework of recomputation algorithms for memory-efficient backpropagation. In Proc. the 33rd International Conference on Neural Information Processing Systems, Dec. 2019, Article No. 105."},{"key":"4178_CR131","first-page":"497","volume-title":"Proc. the 3rd Conference on Machine Learning and Systems","author":"P Jain","year":"2020","unstructured":"Jain P, Jain A, Nrusimha A, Gholami A, Abbeel P, Gonzalez J, Keutzer K, Stoica I. Checkmate: Breaking the memory wall with optimal tensor rematerialization. In Proc. the 3rd Conference on Machine Learning and Systems, Mar. 2020, pp.497\u2013511."},{"key":"4178_CR132","volume-title":"Proc. the 9th International Conference on Learning Representations","author":"M Kirisame","year":"2021","unstructured":"Kirisame M, Lyubomirsky S, Haan A, Brennan J, He M, Roesch J, Chen T, Tatlock Z. Dynamic tensor rematerialization. In Proc. the 9th International Conference on Learning Representations, May 2021."},{"key":"4178_CR133","volume-title":"Proc. the 6th Conference on Machine Learning and Systems","author":"V A Korthikanti","year":"2023","unstructured":"Korthikanti V A, Casper J, Lym S, McAfee L, Andersch M, Shoeybi M, Catanzaro B. Reducing activation recomputation in large transformer models. In Proc. the 6th Conference on Machine Learning and Systems, Jun. 2023."},{"key":"4178_CR134","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/MICRO.2016.7783721","volume-title":"Proc. the 49th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"M Rhu","year":"2016","unstructured":"Rhu M, Gimelshein N, Clemons J, Zulfiqar A, Keckler S W. vDNN: Virtualized deep neural networks for scalable, memory-efficient neural network design. In Proc. the 49th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO), Oct. 2016, pp.1\u201313. DOI: https:\/\/doi.org\/10.1109\/MICRO.2016.7783721."},{"key":"4178_CR135","first-page":"551","volume-title":"Proc. the 2021 USENIX Annual Technical Conference","author":"J Ren","year":"2021","unstructured":"Ren J, Rajbhandari S, Aminabadi R Y, Ruwase O, Yang S, Zhang M, Li D, He Y. ZeRO-Offload: Democratizing billion-scale model training. In Proc. 
the 2021 USENIX Annual Technical Conference, Jul. 2021, pp.551\u2013564."},{"key":"4178_CR136","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205","volume-title":"Proc. the 2021 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"S Rajbhandari","year":"2021","unstructured":"Rajbhandari S, Ruwase O, Rasley J, Smith S, He Y. ZeRO-infinity: Breaking the GPU memory wall for extreme scale deep learning. In Proc. the 2021 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2021. DOI: https:\/\/doi.org\/10.1145\/3458817.3476205."},{"issue":"1","key":"4178_CR137","doi-asserted-by":"publisher","first-page":"304","DOI":"10.1109\/TPDS.2022.3219819","volume":"34","author":"J R Fang","year":"2023","unstructured":"Fang J R, Zhu Z L, Li S G, Su H, Yu Y, Zhou J, You Y. Parallel training of pre-trained models via chunk-based dynamic memory management. IEEE Trans. Parallel and Distributed Systems, 2023, 34(1): 304\u2013315. DOI: https:\/\/doi.org\/10.1109\/TPDS.2022.3219819.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4178_CR138","first-page":"395","volume-title":"Proc. the 56th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"H Zhang","year":"2023","unstructured":"Zhang H, Zhou Y E, Xue Y, Liu Y, Huang J. G10: Enabling an efficient unified GPU memory and storage architecture with smart tensor migrations. In Proc. the 56th IEEE\/ACM International Symposium on Microarchitecture (MICRO), Oct. 28\u2013Nov. 1, 2023, pp.395\u2013410."},{"key":"4178_CR139","doi-asserted-by":"publisher","first-page":"891","DOI":"10.1145\/3373376.3378505","volume-title":"Proc. the 25th International Conference on Architectural Support for Programming Languages and Operating Systems","author":"X Peng","year":"2020","unstructured":"Peng X, Shi X, Dai H, Jin H, Ma W, Xiong Q, Yang F, Qian X. Capuchin: Tensor-based GPU memory management for deep learning. In Proc. the 25th International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2020, pp.891\u2013905. DOI: https:\/\/doi.org\/10.1145\/3373376.3378505."},{"key":"4178_CR140","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00023","volume-title":"Proc. the 2020 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"M Wahib","year":"2020","unstructured":"Wahib M, Zhang H, Nguyen T T, Drozd A, Domke J, Zhang L, Takano R, Matsuoka S. Scaling distributed deep learning workloads beyond the memory capacity with KARMA. In Proc. the 2020 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2020. DOI: https:\/\/doi.org\/10.1109\/SC41405.2020.00023."},{"issue":"8","key":"4178_CR141","doi-asserted-by":"publisher","first-page":"2403","DOI":"10.1109\/TPDS.2023.3266110","volume":"34","author":"Z Zong","year":"2023","unstructured":"Zong Z, Lin L, Lin L, Wen L, Sun Y. STR: Hybrid tensor re-generation to break memory wall for DNN training. IEEE Trans. Parallel and Distributed Systems, 2023, 34(8): 2403\u20132418. DOI: https:\/\/doi.org\/10.1109\/TPDS.2023.3266110.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4178_CR142","doi-asserted-by":"publisher","first-page":"1341","DOI":"10.1145\/3373376.3378530","volume-title":"Proc. 
the 25th International Conference on Architectural Support for Programming Languages and Operating Systems","author":"C C Huang","year":"2020","unstructured":"Huang C C, Jin G, Li J. SwapAdvisor: Pushing deep learning beyond the GPU memory limit via smart swapping. In Proc. the 25th International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2020, pp.1341\u20131355. DOI: https:\/\/doi.org\/10.1145\/3373376.3378530."},{"issue":"3","key":"4178_CR143","doi-asserted-by":"publisher","first-page":"826","DOI":"10.1109\/TC.2022.3180991","volume":"72","author":"S He","year":"2022","unstructured":"He S, Chen P, Chen S, Li Z, Yang S, Chen W, Shou L. HOME: A holistic GPU memory management framework for deep learning. IEEE Trans. Computers, 2022, 72(3): 826\u2013838. DOI: https:\/\/doi.org\/10.1109\/TC.2022.3180991.","journal-title":"IEEE Trans. Computers"},{"key":"4178_CR144","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1145\/3178487.3178491","volume-title":"Proc. the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","author":"L Wang","year":"2018","unstructured":"Wang L, Ye J, Zhao Y, Wu W, Li A, Song S L, Xu Z, Kraska T. Superneurons: Dynamic GPU memory management for training deep neural networks. In Proc. the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, Feb. 2018, pp.41\u201353. DOI: https:\/\/doi.org\/10.1145\/3178487.3178491."},{"key":"4178_CR145","volume-title":"Proc. the 8th International Conference on Learning Representations","author":"L Cambier","year":"2020","unstructured":"Cambier L, Bhiwandiwalla A, Gong T, Elibol O H, Nekuii M, Tang H. Shifted and squeezed 8-bit floating point format for low-precision training of deep neural networks. In Proc. the 8th International Conference on Learning Representations, Apr. 2020."},{"key":"4178_CR146","doi-asserted-by":"publisher","first-page":"2327","DOI":"10.1109\/CVPR42600.2020.00240","volume-title":"Proc. the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"X Zhang","year":"2020","unstructured":"Zhang X, Liu S, Zhang R, Liu C, Huang D, Zhou S, Guo J, Guo Q, Du Z, Zhi T, Chen Y. Fixed-point back-propagation training. In Proc. the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Jun. 2020, pp.2327\u20132335. DOI: https:\/\/doi.org\/10.1109\/CVPR42600.2020.00240."},{"key":"4178_CR147","unstructured":"Peng H, Wu K, Wei Y et al. FP8-LM: Training FP8 large language models. arXiv: 2310.18313, 2023. https:\/\/arxiv.org\/abs\/2310.18313, Nov. 2024."},{"key":"4178_CR148","volume-title":"Proc. the 37th International Conference on Neural Information Processing Systems","author":"T Dettmers","year":"2023","unstructured":"Dettmers T, Pagnoni A, Holtzman A, Zettlemoyer L. QLORA: Efficient finetuning of quantized LLMs. In Proc. the 37th International Conference on Neural Information Processing Systems, Dec. 2023, Article No. 441."},{"key":"4178_CR149","unstructured":"Pan Z, Chen P, He H, Liu J, Cai J, Zhuang B. Mesa: A memory-saving training framework for transformers. arXiv: 2111.11124, 2021. https:\/\/arxiv.org\/abs\/2111.11124, Nov. 2024."},{"key":"4178_CR150","first-page":"14139","volume-title":"Proc. the 39th International Conference on Machine Learning","author":"X Liu","year":"2022","unstructured":"Liu X, Zheng L, Wang D, Cen Y, Chen W, Han X, Chen J, Liu Z, Tang J, Gonzalez J, Mahoney M W, Cheung A. GACT: Activation compressed training for generic network architectures. 
In Proc. the 39th International Conference on Machine Learning, Jul. 2022, pp.14139\u201314152."},{"key":"4178_CR151","first-page":"1058","volume-title":"Proc. the 15th Annual Conference of the International Speech Communication Association","author":"F Seide","year":"2014","unstructured":"Seide F, Fu H, Droppo J, Li G, Yu D. 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs. In Proc. the 15th Annual Conference of the International Speech Communication Association, Sept. 2014, pp.1058\u20131062."},{"key":"4178_CR152","first-page":"10118","volume-title":"Proc. the 38th International Conference on Machine Learning","author":"H Tang","year":"2021","unstructured":"Tang H, Gan S, Awan A A, Rajbhandari S, Li C, Lian X, Liu J, Zhang C, He Y. 1-bit Adam: Communication efficient large-scale training with Adam\u2019s convergence speed. In Proc. the 38th International Conference on Machine Learning, Jul. 2021, pp.10118\u201310129."},{"key":"4178_CR153","volume-title":"Proc. the 11th International Conference on Learning Representations","author":"Y Lu","year":"2023","unstructured":"Lu Y, Li C, Zhang M, De Sa C, He Y. Maximizing communication efficiency for large-scale training via 0\/1 Adam. In Proc. the 11th International Conference on Learning Representations, May 2023."},{"key":"4178_CR154","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1145\/3341301.3359630","volume-title":"Proc. the 27th ACM Symposium on Operating Systems Principles","author":"Z Jia","year":"2019","unstructured":"Jia Z, Padon O, Thomas J, Warszawski T, Zaharia M, Aiken A. TASO: Optimizing deep learning computation with automatic generation of graph substitutions. In Proc. the 27th ACM Symposium on Operating Systems Principles, Oct. 2019, pp.47\u201362. DOI: https:\/\/doi.org\/10.1145\/3341301.3359630."},{"key":"4178_CR155","first-page":"711","volume-title":"Proc. the 4th Conference on Machine Learning and Systems","author":"A Ivanov","year":"2021","unstructured":"Ivanov A, Dryden N, Ben-Nun T, Li S, Hoefler T. Data movement is all you need: A case study on optimizing transformers. In Proc. the 4th Conference on Machine Learning and Systems, Apr. 2021, pp.711\u2013732."},{"key":"4178_CR156","unstructured":"Rabe M N, Staats C. Self-attention does not need O(n\u00b2) memory. arXiv: 2112.05682, 2021. https:\/\/arxiv.org\/abs\/2112.05682, Nov. 2024."},{"key":"4178_CR157","volume-title":"Proc. the 36th International Conference on Neural Information Processing Systems","author":"T Dao","year":"2022","unstructured":"Dao T, Fu D Y, Ermon S, Rudra A, R\u00e9 C. FLASHATTENTION: Fast and memory-efficient exact attention with IO-awareness. In Proc. the 36th International Conference on Neural Information Processing Systems, Nov. 28\u2013Dec. 9, 2022, Article No. 1189."},{"key":"4178_CR158","volume-title":"Proc. the 12th International Conference on Learning Representations","author":"T Dao","year":"2024","unstructured":"Dao T. FlashAttention-2: Faster attention with better parallelism and work partitioning. In Proc. the 12th International Conference on Learning Representations, May 2024."},{"key":"4178_CR159","doi-asserted-by":"publisher","first-page":"1113","DOI":"10.1109\/HPCA56546.2023.10071018","volume-title":"Proc. the 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"S Zheng","year":"2023","unstructured":"Zheng S, Chen S, Song P, Chen R, Li X, Yan S, Lin D, Leng J, Liang Y. 
Chimera: An analytical optimizing framework for effective compute-intensive operators fusion. In Proc. the 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA), Feb. 25\u2013Mar. 1, 2023, pp.1113\u20131126. DOI: https:\/\/doi.org\/10.1109\/HPCA56546.2023.10071018."},{"key":"4178_CR160","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/ICCAD57390.2023.10323944","volume-title":"Proc. the 2023 IEEE\/ACM International Conference on Computer Aided Design (ICCAD)","author":"F Wang","year":"2023","unstructured":"Wang F, Shen M. Automatic kernel generation for large language models on deep learning accelerators. In Proc. the 2023 IEEE\/ACM International Conference on Computer Aided Design (ICCAD), Oct. 29\u2013Nov. 2, 2023, pp.1\u20139. DOI: https:\/\/doi.org\/10.1109\/ICCAD57390.2023.10323944."},{"key":"4178_CR161","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1016\/j.neunet.2019.12.027","volume":"125","author":"Y Yang","year":"2020","unstructured":"Yang Y, Deng L, Wu S, Yan T, Xie Y, Li G. Training high-performance and large-scale deep neural networks with full 8-bit integers. Neural Networks, 2020, 125: 70\u201382. DOI: https:\/\/doi.org\/10.1016\/j.neunet.2019.12.027.","journal-title":"Neural Networks"},{"key":"4178_CR162","doi-asserted-by":"publisher","first-page":"7006","DOI":"10.1109\/TIP.2022.3216776","volume":"31","author":"C Liu","year":"2022","unstructured":"Liu C, Zhang X, Zhang R, Li L, Zhou S, Huang D, Li Z, Du Z, Liu S, Chen T. Rethinking the importance of quantization bias, toward full low-bit training. IEEE Trans. Image Processing, 2022, 31: 7006\u20137019. DOI: https:\/\/doi.org\/10.1109\/TIP.2022.3216776.","journal-title":"IEEE Trans. Image Processing"},{"key":"4178_CR163","doi-asserted-by":"publisher","first-page":"706","DOI":"10.1109\/ISCA52012.2021.00061","volume-title":"Proc. the 48th ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA)","author":"Y Zhao","year":"2021","unstructured":"Zhao Y, Liu C, Du Z, Guo Q, Hu X, Zhuang Y, Zhang Z, Song X, Li W, Zhang X, Li L, Xu Z, Chen T. Cambricon-Q: A hybrid architecture for efficient training. In Proc. the 48th ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA), Jun. 2021, pp.706\u2013719. DOI: https:\/\/doi.org\/10.1109\/ISCA52012.2021.00061."},{"key":"4178_CR164","first-page":"701","volume-title":"Proc. the 17th USENIX Symposium on Operating Systems Design and Implementation","author":"Y Shi","year":"2023","unstructured":"Shi Y, Yang Z, Xue J, Ma L, Xia Y, Miao Z, Guo Y, Yang F, Zhou L. Welder: Scheduling deep learning memory access via tile-graph. In Proc. the 17th USENIX Symposium on Operating Systems Design and Implementation, Jul. 2023, pp.701\u2013718."},{"key":"4178_CR165","doi-asserted-by":"publisher","first-page":"804","DOI":"10.1145\/3575693.3576933","volume-title":"Proc. the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"S Feng","year":"2023","unstructured":"Feng S, Hou B, Jin H, Lin W, Shao J, Lai R, Ye Z, Zheng L, Yu C H, Yu Y, Chen T. TensorIR: An abstraction for automatic tensorized program optimization. In Proc. the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2023, pp.804\u2013817. DOI: https:\/\/doi.org\/10.1145\/3575693.3576933."},{"key":"4178_CR166","doi-asserted-by":"publisher","first-page":"314","DOI":"10.1145\/3582016.3582061","volume-title":"Proc. 
the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"J Bi","year":"2023","unstructured":"Bi J, Guo Q, Li X, Zhao Y, Wen Y, Guo Y, Zhou E, Hu X, Du Z, Li L, Chen H, Chen T. Heron: Automatically constrained high-performance library generation for deep learning accelerators. In Proc. the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Mar. 2023, pp.314\u2013328. DOI: https:\/\/doi.org\/10.1145\/3582016.3582061."},{"key":"4178_CR167","first-page":"5506","volume-title":"Proc. the 38th International Conference on Machine Learning","author":"S Kim","year":"2021","unstructured":"Kim S, Gholami A, Yao Z, Mahoney M W, Keutzer K. I-BERT: Integer-only BERT quantization. In Proc. the 38th International Conference on Machine Learning, Jul. 2021, pp.5506\u20135518."},{"key":"4178_CR168","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1145\/3600006.3613165","volume-title":"Proc. the 29th Symposium on Operating Systems Principles","author":"W Kwon","year":"2023","unstructured":"Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J, Zhang H, Stoica I. Efficient memory management for large language model serving with PagedAttention. In Proc. the 29th Symposium on Operating Systems Principles, Oct. 2023, pp.611\u2013626. DOI: https:\/\/doi.org\/10.1145\/3600006.3613165."},{"key":"4178_CR169","volume-title":"Proc. the 37th International Conference on Neural Information Processing Systems","author":"Z Liu","year":"2023","unstructured":"Liu Z, Desai A, Liao F, Wang W, Xie V, Xu Z, Kyrillidis A, Shrivastava A. Scissorhands: Exploiting the persistence of importance hypothesis for LLM KV cache compression at test time. In Proc. the 37th International Conference on Neural Information Processing Systems, Dec. 2023, Article No. 2279."},{"key":"4178_CR170","volume-title":"Proc. the 12th International Conference on Learning Representations","author":"S Ge","year":"2024","unstructured":"Ge S, Zhang Y, Liu L, Zhang M, Han J, Gao J. Model tells you what to discard: Adaptive KV cache compression for LLMs. In Proc. the 12th International Conference on Learning Representations, May 2024."},{"key":"4178_CR171","doi-asserted-by":"publisher","first-page":"344","DOI":"10.1109\/IPDPS54959.2023.00042","volume-title":"Proc. the 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"Y Zhai","year":"2023","unstructured":"Zhai Y, Jiang C, Wang L, Jia X, Zhang S, Chen Z, Liu X, Zhu Y. ByteTransformer: A high-performance transformer boosted for variable-length inputs. In Proc. the 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS), May 2023, pp.344\u2013355. DOI: https:\/\/doi.org\/10.1109\/IPDPS54959.2023.00042."},{"key":"4178_CR172","first-page":"521","volume-title":"Proc. the 16th USENIX Symposium on Operating Systems Design and Implementation","author":"G I Yu","year":"2022","unstructured":"Yu G I, Jeong J S, Kim G W, Kim S, Chun B G. Orca: A distributed serving system for transformer-based generative models. In Proc. the 16th USENIX Symposium on Operating Systems Design and Implementation, Jul. 2022, pp.521\u2013538."},{"key":"4178_CR173","unstructured":"Agrawal A, Panwar A, Mohan J, Kwatra N, Gulavani B S, Ramjee R. SARATHI: Efficient LLM inference by piggybacking decodes with chunked prefills. arXiv: 2308.16369, 2023. https:\/\/arxiv.org\/abs\/2308.16369, Nov. 
2024."},{"key":"4178_CR174","first-page":"208","volume-title":"Proc. the Fourth Conference on Machine Learning and Systems","author":"H Shen","year":"2021","unstructured":"Shen H, Roesch J, Chen Z, Chen W, Wu Y, Li M, Sharma V, Tatlock Z, Wang Y. Nimble: Efficiently compiling dynamic neural networks for model inference. In Proc. the Fourth Conference on Machine Learning and Systems, Apr. 2021, pp.208\u2013222."},{"key":"4178_CR175","first-page":"721","volume-title":"Proc. the 5th Conference on Machine Learning and Systems","author":"P Fegade","year":"2022","unstructured":"Fegade P, Chen T, Gibbons P, Mowry T. The CoRa tensor compiler: Compilation for ragged tensors with minimal padding. In Proc. the 5th Conference on Machine Learning and Systems, Aug. 2022, pp.721\u2013747."},{"key":"4178_CR176","unstructured":"Sun Y, Dong L, Huang S, Ma S, Xia Y, Xue J, Wang J, Wei F. Retentive network: A successor to transformer for large language models. arXiv: 2307.08621, 2023. https:\/\/arxiv.org\/abs\/2307.08621, Nov. 2024."},{"key":"4178_CR177","volume-title":"Proc. the 37th International Conference on Neural Information Processing Systems","author":"L Yu","year":"2023","unstructured":"Yu L, Simig D, Flaherty C, Aghajanyan A, Zettlemoyer L, Lewis M. MEGABYTE: Modeling million-byte sequences with multiscale transformers. In Proc. the 37th International Conference on Neural Information Processing Systems, Dec. 2023, Article No. 3447."},{"key":"4178_CR178","volume-title":"Proc. the 41st International Conference on Machine Learning","author":"T Dao","year":"2024","unstructured":"Dao T, Gu A. Transformers are SSMs: Generalized models and efficient algorithms through structured state space duality. In Proc. the 41st International Conference on Machine Learning, Jul. 2024."},{"key":"4178_CR179","doi-asserted-by":"crossref","unstructured":"Zhao L, Maleki S, Shah A, Yang Z, Pourreza H, Krishnamurthy A. ForestColl: Efficient collective communications on heterogeneous network fabrics. arXiv: 2402.06787, 2024. https:\/\/arxiv.org\/abs\/2402.06787, Nov. 
2024.","DOI":"10.23919\/EUSIPCO63174.2024.10715002"}],"container-title":["Journal of Computer Science and Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-024-4178-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11390-024-4178-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-024-4178-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T15:29:08Z","timestamp":1741793348000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11390-024-4178-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1]]},"references-count":179,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["4178"],"URL":"https:\/\/doi.org\/10.1007\/s11390-024-4178-1","relation":{},"ISSN":["1000-9000","1860-4749"],"issn-type":[{"value":"1000-9000","type":"print"},{"value":"1860-4749","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1]]},"assertion":[{"value":"8 February 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Conflict of Interest The authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics"}}]}}