{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,2]],"date-time":"2025-08-02T14:52:53Z","timestamp":1754146373050,"version":"3.41.2"},"reference-count":64,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2022ZD0160201"],"award-info":[{"award-number":["2022ZD0160201"]}]},{"name":"HK RGC RIF","award":["R7030-22"],"award-info":[{"award-number":["R7030-22"]}]},{"name":"HK RGC GRF","award":["17208223","17204424"],"award-info":[{"award-number":["17208223","17204424"]}]},{"name":"Huawei flagship research"},{"name":"HKU-CAS Joint Laboratory for Intelligent System Software"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tpds.2025.3583983","type":"journal-article","created":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T13:47:39Z","timestamp":1751032059000},"page":"1872-1889","source":"Crossref","is-referenced-by-count":0,"title":["PipeMesh: Achieving Memory-Efficient Computation-Communication Overlap for Training Large Language Models"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2268-3036","authenticated-orcid":false,"given":"Fanxin","family":"Li","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1643-2583","authenticated-orcid":false,"given":"Shixiong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1500-2192","authenticated-orcid":false,"given":"Yuhao","family":"Qing","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8684-8509","authenticated-orcid":false,"given":"Jianyu","family":"Jiang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2807-9780","authenticated-orcid":false,"given":"Xusheng","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7746-440X","authenticated-orcid":false,"given":"Heming","family":"Cui","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Hong Kong, Hong Kong, SAR, China"}]}],"member":"263","reference":[{"year":"2020","author":"Brown","article-title":"Language models are few-shot learners","key":"ref1"},{"year":"2023","author":"Touvron","article-title":"Llama 2: Open foundation and fine-tuned chat models","key":"ref2"},{"year":"2022","author":"Workshop","article-title":"BLOOM: A 176B-parameter open-access multilingual language model","key":"ref3"},{"year":"2023","author":"Jiang","article-title":"Mistral 7B","key":"ref4"},{"year":"2023","author":"Almazrouei","article-title":"The falcon series of open language models","key":"ref5"},{"year":"2023","author":"Team","article-title":"Introducing MPT-30B: Raising the bar for open-source foundation models","key":"ref6"},{"year":"2017","author":"Vaswani","article-title":"Attention is all you need","key":"ref7"},{"year":"2019","author":"Shoeybi","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","key":"ref9"},{"year":"2022","author":"Korthikanti","article-title":"Reducing activation recomputation in large transformer models","key":"ref10"},{"year":"2023","author":"Liu","article-title":"Ring attention with blockwise transformers for near-infinite context","key":"ref11"},{"year":"2018","author":"Huang","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","key":"ref12"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1145\/3341301.3359646"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1145\/3458817.3476209"},{"key":"ref17","first-page":"945","article-title":"MLaaS in the wild: Workload analysis and scheduling in large-scale heterogeneous GPU clusters","volume-title":"Proc. 19th USENIX Symp. Netw. Syst. Des. Implementation","author":"Weng"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/SC41405.2020.00024"},{"year":"2024","author":"Jiang","article-title":"MegaScale: Scaling large language model training to more than 10,000 GPUs","key":"ref19"},{"year":"2023","author":"Qi","article-title":"Zero bubble pipeline parallelism","key":"ref20"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/TPDS.2023.3247883"},{"key":"ref22","first-page":"48","article-title":"Breadth-first pipeline parallelism","volume-title":"Proc. Int. Conf. Mach. Learn. Syst.","author":"Lamy-Poirier"},{"year":"2016","author":"Chen","article-title":"Training deep nets with sublinear memory cost","key":"ref23"},{"key":"ref24","first-page":"497","article-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization","volume-title":"Proc. Int. Conf. Mach. Learn. Syst.","author":"Jain"},{"year":"2020","author":"Kirisame","article-title":"Dynamic tensor rematerialization","key":"ref25"},{"year":"2018","author":"Sergeev","article-title":"Horovod: Fast and easy distributed deep learning in tensorflow","key":"ref26"},{"year":"2014","author":"Kingma","article-title":"Adam: A method for stochastic optimization","key":"ref27"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/TPDS.2021.3094364"},{"key":"ref29","first-page":"269","article-title":"PipeMare: Asynchronous pipeline parallel DNN training","volume-title":"Proc. Int. Conf. Mach. Learn. Syst.","author":"Yang"},{"key":"ref30","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Narayanan"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1145\/79173.79181"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/HCS55958.2022.9895592"},{"year":"2020","author":"Kaplan","article-title":"Scaling laws for neural language models","key":"ref33"},{"year":"2017","author":"Micikevicius","article-title":"Mixed precision training","key":"ref34"},{"year":"2016","author":"Hendrycks","article-title":"Gaussian error linear units (GELUs)","key":"ref36"},{"year":"2020","author":"Shazeer","article-title":"GLU variants improve transformer","key":"ref37"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"ref39","first-page":"551","article-title":"ZeRO-Offload: Democratizing billion-scale model training","volume-title":"Proc. 2021 USENIX Annu. Tech. Conf.","author":"Ren"},{"year":"2018","author":"McCandlish","article-title":"An empirical model of large-batch training","key":"ref40"},{"key":"ref41","first-page":"4042","article-title":"Critical parameters for scalable distributed learning with large batches and asynchronous updates","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Stich"},{"volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Tarnawski","article-title":"Piper: Multidimensional planner for DNN parallelization","key":"ref42"},{"key":"ref43","first-page":"12360","article-title":"Root mean square layer normalization","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"doi-asserted-by":"publisher","key":"ref44","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"ref45","first-page":"16344","article-title":"FlashAttention: Fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dao"},{"doi-asserted-by":"publisher","key":"ref47","DOI":"10.1109\/ISCA52012.2021.00049"},{"year":"2021","author":"Wei","article-title":"Finetuned language models are zero-shot learners","key":"ref48"},{"year":"2021","author":"Sanh","article-title":"Multitask prompted training enables zero-shot task generalization","key":"ref49"},{"year":"2021","author":"He","article-title":"Towards a unified view of parameter-efficient transfer learning","key":"ref50"},{"key":"ref51","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Houlsby"},{"year":"2021","author":"Hu","article-title":"LoRA: Low-rank adaptation of large language models","key":"ref52"},{"volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dettmers","article-title":"QLORA: Efficient finetuning of quantized LLMs","key":"ref53"},{"doi-asserted-by":"publisher","key":"ref54","DOI":"10.14778\/3611540.3611569"},{"doi-asserted-by":"publisher","key":"ref55","DOI":"10.1145\/3492321.3519563"},{"doi-asserted-by":"publisher","key":"ref56","DOI":"10.1145\/3437801.3441593"},{"doi-asserted-by":"publisher","key":"ref57","DOI":"10.1145\/3458817.3476145"},{"doi-asserted-by":"publisher","key":"ref58","DOI":"10.1145\/3620666.3651359"},{"key":"ref59","first-page":"16639","article-title":"BPIPE: Memory-balanced pipeline parallelism for training large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"doi-asserted-by":"publisher","key":"ref60","DOI":"10.1109\/TPDS.2023.3343570"},{"year":"2022","author":"Zheng","article-title":"Alpa: Automating inter-and intra-operator parallelism for distributed deep learning","key":"ref61"},{"key":"ref62","first-page":"267","article-title":"Unity: Accelerating DNN training through joint optimization of algebraic transformations and parallelization","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Unger"},{"key":"ref63","first-page":"4125","article-title":"Memory-efficient backpropagation through time","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Gruslys"},{"year":"2023","author":"Dao","article-title":"FlashAttention-2: Faster attention with better parallelism and work partitioning","key":"ref64"},{"key":"ref65","first-page":"132","article-title":"Priority-based parameter propagation for distributed DNN training","volume-title":"Proc. Int. Conf. Mach. Learn. Syst.","author":"Jayarajan"},{"key":"ref66","first-page":"418","article-title":"TicTac: Accelerating distributed deep learning with communication scheduling","volume-title":"Proc. 2nd Conf. Mach. Learn. Syst.","author":"Hashemi"},{"doi-asserted-by":"publisher","key":"ref67","DOI":"10.1145\/3341301.3359642"},{"key":"ref68","first-page":"526","article-title":"On optimizing the communication of model parallelism","volume-title":"Proc. Int. Conf. Mach. Learn. Syst.","author":"Zhuang"},{"doi-asserted-by":"publisher","key":"ref69","DOI":"10.1145\/3620666.3651379"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/71\/11083525\/11054307.pdf?arnumber=11054307","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,18]],"date-time":"2025-07-18T04:40:29Z","timestamp":1752813629000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11054307\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":64,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2025.3583983","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"type":"print","value":"1045-9219"},{"type":"electronic","value":"1558-2183"},{"type":"electronic","value":"2161-9883"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}