{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T05:41:42Z","timestamp":1760420502836,"version":"build-2065373602"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,22]]},"DOI":"10.1109\/icnp65844.2025.11192443","type":"proceedings-article","created":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T17:38:54Z","timestamp":1760377134000},"page":"1-11","source":"Crossref","is-referenced-by-count":0,"title":["MAZ3: Memory-Assisted ZeRO-3 for Efficient Collective Communication"],"prefix":"10.1109","author":[{"given":"Yang","family":"Liu","sequence":"first","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenyang","family":"Hei","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fuliang","family":"Li","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengxi","family":"Gao","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Shenzhen Institutes of Advanced Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingwei","family":"Wang","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651379"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640423"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"ref4","first-page":"745","article-title":"MegaScale: Scaling large language model training to more than 10,000 GPUs","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1108\/LHTN-01-2023-0009"},{"volume-title":"NVIDIA Data Center GPU Resource Center","year":"2025","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref10","first-page":"551","article-title":"Zero-offload: Democratizing billion-scale model training","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren"},{"volume-title":"Turing-NLG: A 17-billion-parameter language model by Microsoft","year":"2020","author":"Rosset","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref14","first-page":"395","article-title":"G10: Enabling an efficient unified gpu memory and storage architecture with smart tensor migrations","volume-title":"Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Zhang"},{"key":"ref15","first-page":"497","article-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization","volume-title":"Proceedings of Machine Learning and Systems","volume":"2","author":"Jain"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00080"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575736"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"article-title":"BaM: A case for enabling fine-grain high throughput GPU-orchestrated access to storage","year":"2022","author":"Qureshi","key":"ref19"},{"volume-title":"NVIDIA Collective Communications Library (NCCL)","year":"2025","key":"ref20"},{"article-title":"TensorFlow: A System for Large-Scale Machine Learning","volume-title":"Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI\u201916)","author":"Abadi","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541967"},{"article-title":"Bandana: Using Non-Volatile Memory for Storing Deep Learning Models","volume-title":"Proceedings of the Conference on Systems and Machine Learning (SysML\u201919)","author":"Eisenman","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"article-title":"iftop: display bandwidth usage on an interface","year":"2025","author":"Warren","key":"ref25"},{"article-title":"Using Deepspeed and Megatron to train Megatron-Turing NLG 530b, a large-scale generative language model","year":"2022","author":"Smith","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.15"},{"volume-title":"Setting the standard in storage","year":"2025","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/2541228.2541231"},{"key":"ref30","first-page":"559","article-title":"Alpa: Automating inter- and Intra-Operator parallelism for distributed deep learning","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng"},{"volume-title":"Megatron-DeepSpeed","year":"2025","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00076"}],"event":{"name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","start":{"date-parts":[[2025,9,22]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2025,9,25]]}},"container-title":["2025 IEEE 33rd International Conference on Network Protocols (ICNP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11192357\/11192322\/11192443.pdf?arnumber=11192443","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T05:08:58Z","timestamp":1760418538000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11192443\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,22]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/icnp65844.2025.11192443","relation":{},"subject":[],"published":{"date-parts":[[2025,9,22]]}}}