{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T19:01:45Z","timestamp":1764788505697,"version":"3.46.0"},"reference-count":61,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB2901202"],"award-info":[{"award-number":["2022YFB2901202"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Netw. Sci. Eng."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tnse.2025.3607331","type":"journal-article","created":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T17:46:19Z","timestamp":1757353579000},"page":"1962-1976","source":"Crossref","is-referenced-by-count":0,"title":["Parallel Gradient Computation and Synchronization: Enhancing the Efficiency of Distributed Training for LLMs"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6258-0878","authenticated-orcid":false,"given":"Hao","family":"Li","sequence":"first","affiliation":[{"name":"Electronic Information School, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8533-1612","authenticated-orcid":false,"given":"Hao","family":"Jiang","sequence":"additional","affiliation":[{"name":"Electronic Information School, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6955-2087","authenticated-orcid":false,"given":"Jing","family":"Wu","sequence":"additional","affiliation":[{"name":"Electronic Information School, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4919-5712","authenticated-orcid":false,"given":"Guiao","family":"Yang","sequence":"additional","affiliation":[{"name":"Electronic Information School, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4394-0628","authenticated-orcid":false,"given":"Jian","family":"Zhang","sequence":"additional","affiliation":[{"name":"Electronic Information School, Wuhan University, Wuhan, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3365742"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9891914"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2025.3557620"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3539433"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3352100"},{"article-title":"Reducing the barriers to entry for foundation model 
training","year":"2024","author":"Faraboschi","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.12.003"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-019-2705-2"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3330542"},{"article-title":"Gemma: Open models based on gemini research and technology","year":"2024","author":"Team","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/s40430-024-05048-w"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2025.3563200"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352141"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4842-1043-7_1"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2024.3456354"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/MSST.2019.00004"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1364\/JOCN.462286"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2020.06.007"},{"key":"ref19","first-page":"629","article-title":"Gaia:$\\lbrace$Geo-Distributed $\\rbrace$ machine learning approaching $\\lbrace$ LAN$\\rbrace$ speeds","volume-title":"Proc. 14th USENIX Symp. Networked Syst. Des. Implementation","author":"Hsieh","year":"2017"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2024.3465447"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528939"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3154387"},{"article-title":"Gradient-mask tuning elevates the upper limits of LLM performance","year":"2024","author":"Li","key":"ref23"},{"key":"ref24","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume-title":"Proc. Mach. Learn. 
Syst.","volume":"5","author":"Korthikanti"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605624"},{"article-title":"Branch-train-merge: Embarrassingly parallel training of expert language models","year":"2022","author":"Li","key":"ref27"},{"article-title":"Branch-train-mix: Mixing expert LLMs into a mixture-of-experts LLM","year":"2024","author":"Sukhbaatar","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2025.3550841"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-industry.36"},{"article-title":"ACCO: Accumulate while you communicate, hiding communications in distributed LLM training","year":"2024","author":"Nabli","key":"ref31"},{"article-title":"TinyR1-32B-preview: Boosting accuracy with branch-merge distillation","year":"2025","author":"Sun","key":"ref32"},{"article-title":"LLM augmented llms: Expanding capabilities through composition","year":"2024","author":"Bansal","key":"ref33"},{"article-title":"Knowledge fusion of large language models","year":"2024","author":"Wan","key":"ref34"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICWS62655.2024.00177"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545087"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3385639"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2025.3564736"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218538"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10887718"},{"key":"ref42","first-page":"3259","article-title":"Linear mode connectivity and the lottery ticket hypothesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Frankle","year":"2020"},{"key":"ref43","first-page":"12589","article-title":"Composing parameter-efficient modules with arithmetic operation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zhang"},{"key":"ref44","first-page":"17703","article-title":"Merging models with fisher-weighted averaging","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Matena","year":"2022"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS60453.2023.00126"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528939"},{"key":"ref47","first-page":"430","article-title":"Pathways: Asynchronous distributed dataflow for ML","volume-title":"Proc. Mach. Learn. 
Syst.","volume":"4","author":"Barham"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00031"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3642970.3655843"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/WCNC61545.2025.10978332"},{"article-title":"Efficient training of large language models on distributed infrastructures: A survey","year":"2024","author":"Duan","key":"ref51"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2025.3581728"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/BigData62323.2024.10825823"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488678"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TSC.2024.3399654"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/tmi.2025.3591185"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS61541.2024.00031"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577391"},{"article-title":"Optimized network architectures for large language model training with billions of parameters","year":"2023","author":"Wang","key":"ref59"},{"article-title":"Scaling laws for neural language models","year":"2020","author":"Kaplan","key":"ref60"},{"article-title":"The pile: An 800gb dataset of diverse text for language modeling","year":"2020","author":"Gao","key":"ref61"}],"container-title":["IEEE Transactions on Network Science and Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6488902\/11264281\/11153704.pdf?arnumber=11153704","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T18:43:57Z","timestamp":1764787437000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11153704\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":61,"URL":"https:\/\/doi.org\/10.1109\/tnse.2025.3607331","relation":{},"ISSN":["2327-4697","2334-329X"],"issn-type":[{"type":"electronic","value":"2327-4697"},{"type":"electronic","value":"2334-329X"}],"subject":[],"published":{"date-parts":[[2026]]}}}