{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T11:11:09Z","timestamp":1783077069477,"version":"3.54.6"},"reference-count":45,"publisher":"Tsinghua University Press","issue":"4","funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"publisher","award":["2022ZD0115304"],"award-info":[{"award-number":["2022ZD0115304"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020771","name":"National Natural Science Foundation of China for Young Scientists Fund","doi-asserted-by":"publisher","award":["62402266"],"award-info":[{"award-number":["62402266"]}],"id":[{"id":"10.13039\/501100020771","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014219","name":"National Natural Science Foundation of China for Distinguished Young Scholar","doi-asserted-by":"publisher","award":["62225206"],"award-info":[{"award-number":["62225206"]}],"id":[{"id":"10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Big Data Min. Anal."],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.26599\/bdma.2025.9020031","type":"journal-article","created":{"date-parts":[[2025,5,12]],"date-time":"2025-05-12T17:46:07Z","timestamp":1747071967000},"page":"966-980","source":"Crossref","is-referenced-by-count":4,"title":["Training Large Models on Heterogeneous and Geo-Distributed Resource with Constricted Networks"],"prefix":"10.26599","volume":"8","author":[{"given":"Zan","family":"Zong","sequence":"first","affiliation":[{"name":"Tsinghua University,Department of Computer Science and Technology,Beijing,China,100084"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minkun","family":"Guo","sequence":"additional","affiliation":[{"name":"Tsinghua University,Department of Computer Science and Technology,Beijing,China,100084"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mingshu","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University,Department of Computer Science and Technology,Beijing,China,100084"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yinan","family":"Tang","sequence":"additional","affiliation":[{"name":"IEIT SYSTEMS Co., Ltd.,Jinan,China,250014"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianjiang","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer &#x0026; Communication Engineering, University of Science and Technology Beijing,Beijing,China,100083"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University,Department of Computer Science and Technology,Beijing,China,100084"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"11138","reference":[{"key":"ref1","volume-title":"GPT-4 technical report","year":"2024"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref3","volume-title":"Gemini:A family of highly capable multimodal models","year":"2024"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00062"},{"key":"ref5","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605613"},{"key":"ref7","first-page":"961","article-title":"SmartMoE: Efficiently training sparsely-activated models through combining offline and online parallelization","volume-title":"Proc. 2023 USENIX Annu. Technical Conf.","author":"Zhai"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.2307\/jj.28491914.17"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465301"},{"key":"ref13","first-page":"30","article-title":"MAST: Global scheduling of ML training across geo-distributed datacenters at hyperscale","volume-title":"Proc. 18th USENIX Conf. Operating Systems Design and Implementation","author":"Choudhury"},{"key":"ref14","first-page":"1846","article-title":"Decentralized training of foundation models in heterogeneous environments","volume-title":"Proc. 36th Int. Conf. Neural Information Processing Systems","author":"Yuan"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-demo.54"},{"key":"ref16","first-page":"1497","article-title":"CocktailSGD: Fine-tuning foundation models over 500mbps networks","volume-title":"Proc. 40th Int. Conf. Machine Learning","author":"Wang"},{"key":"ref17","volume-title":"Nvidia connectx infiniband","year":"2024"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-021-00427-9"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i21.34417"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.14778\/3598581.3598604"},{"key":"ref21","first-page":"21","article-title":"HetPipe: Enabling large DNN training on (whimpy) heterogeneous GPU clusters through integration of pipelined model parallelism and data parallelism, in","volume-title":"Proc. 2020 USENIX Conf. Usenix Annu. Technical Conf., Virtual Event","author":"Park","year":"2020"},{"key":"ref22","first-page":"35","article-title":"Metis: Fast automatic distributed training on heterogeneous GPUs","volume-title":"Proc. 2024 USENIX Conf. Usenix Annu. Technical Conf.","author":"Um"},{"key":"ref23","first-page":"559","article-title":"Alpa: Automating inter- and intra-operator parallelism for distributed deep learning","volume-title":"Proc. 16th USENIX Symp. Operating Systems Design and Implementation","author":"Zheng"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531462"},{"key":"ref26","first-page":"480","article-title":"AMP: Automatically finding model parallel strategies with heterogeneity awareness","volume-title":"Proc. 36th Int. Conf. Neural Information Processing Systems","author":"Li"},{"key":"ref27","first-page":"58","article-title":"Efficient large-scale language model training on GPU clusters using megatron-LM","volume-title":"Proc. Int. Conf. High Performance Computing, Networking, Storage and Analysis","author":"Narayanan"},{"key":"ref28","first-page":"10","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","volume-title":"Proc. 33rd Int. Conf. Neural Information Processing Systems, Virtual Event","author":"Huang","year":"2019"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"ref30","article-title":"Blink: Fast and generic collectives for distributed ML","volume-title":"Proc. 3rd MLSys Conf.","author":"Wang"},{"key":"ref31","first-page":"24829","article-title":"Piper: Multidimensional planner for DNN parallelization","volume-title":"Proc. 35th Conf. Neural Information Processing Systems, Virtual Event","author":"Tarnawski","year":"2021"},{"key":"ref32","volume-title":"GPU performance background","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/2408776.2408794"},{"key":"ref34","article-title":"Reducing activation recomputation in large transformer models","volume-title":"Proc. 6th MLSys Conf.","author":"Korthikanti"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3215517"},{"key":"ref36","article-title":"Does compressing activations help model parallel training?","volume-title":"Proc. 7th MLSys Conf.","author":"Bian"},{"key":"ref37","first-page":"1278","article-title":"PowerSGD: Practical low-rank gradient compression for distributed optimization","volume-title":"Proc. 33rd Int. Conf. Neural Information Processing Systems","author":"Vogels"},{"key":"ref38","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","volume-title":"Proc. 6th Int. Conf. Learning Representations","author":"Lin"},{"key":"ref39","first-page":"5151","article-title":"Scalable methods for 8-bit training of neural networks","volume-title":"Proc. 32nd Int. Conf. Neural Information Processing Systems","author":"Banner"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.26599\/BDMA.2020.9020004"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3266110"},{"key":"ref42","volume-title":"GPT series model training","year":"2023"},{"key":"ref43","volume-title":"LlaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref44","article-title":"Pointer sentinel mixture models","volume-title":"Proc. 5th Int. Conf. Learning Representations","author":"Merity"},{"key":"ref45","volume-title":"The RefinedWeb dataset for Falcon LLM: Outperforming curated corpora with web data, and web data only","author":"Penedo","year":"2023"}],"container-title":["Big Data Mining and Analytics"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/8254253\/11002434\/11002440.pdf?arnumber=11002440","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,13]],"date-time":"2025-05-13T06:15:03Z","timestamp":1747116903000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11002440\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":45,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.26599\/bdma.2025.9020031","relation":{},"ISSN":["2096-0654","2097-406X"],"issn-type":[{"value":"2096-0654","type":"print"},{"value":"2097-406X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8]]}}}