{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T09:41:29Z","timestamp":1775122889667,"version":"3.50.1"},"reference-count":64,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,2]]},"DOI":"10.1109\/hpca56546.2023.10071077","type":"proceedings-article","created":{"date-parts":[[2023,3,24]],"date-time":"2023-03-24T17:42:55Z","timestamp":1679679775000},"page":"556-569","source":"Crossref","is-referenced-by-count":18,"title":["MPress: Democratizing Billion-Scale Model Training on Multi-GPU Servers via Memory-Saving Inter-Operator Parallelism"],"prefix":"10.1109","author":[{"given":"Quan","family":"Zhou","sequence":"first","affiliation":[{"name":"University of Science and Technology of China"}]},{"given":"Haiquan","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}]},{"given":"Xiaoyan","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}]},{"given":"Cheng","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}]},{"given":"Youhui","family":"Bai","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}]},{"given":"Feng","family":"Yan","sequence":"additional","affiliation":[{"name":"University of Houston"}]},{"given":"Yinlong","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Bert from Google Research"},{"key":"ref2","article-title":"Code of MPress"},{"key":"ref3","article-title":"Convnet-burden"},{"key":"ref4","article-title":"DeepSpeed Code Repository"},{"key":"ref5","article-title":"Distributed training of deep learning models on Azure"},{"key":"ref6","article-title":"High Bandwidth Memory"},{"key":"ref7","article-title":"Introducing to P3.24xlarge of AWS"},{"key":"ref8","article-title":"NVIDIA Grace-Hopper Superchip Architecture"},{"key":"ref9","article-title":"NVIDIA H100 Tensor Core GPU Architecture"},{"key":"ref10","article-title":"Pytorch Homepage"},{"key":"ref11","article-title":"PyTorch Pretrained Bert"},{"key":"ref12","article-title":"Stanford Question Answering Dataset v1.1"},{"key":"ref13","article-title":"The Code Repo for PipeDream: Pipeline Parallelism for DNN Training"},{"key":"ref14","article-title":"White Paper for DGX-1 with Tesla V100"},{"key":"ref15","article-title":"Wikipedia Dataset"},{"key":"ref16","article-title":"Training deep nets with sublinear memory cost","author":"Chen","year":"2016"},{"key":"ref17","first-page":"3123","article-title":"Binaryconnect: Training deep neural networks with binary weights during propagations","author":"Courbariaux","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref18","article-title":"Binarized neural networks: Training deep neural networks with weights and activations constrained to+ 1 or-1","author":"Courbariaux","year":"2016"},{"key":"ref19","first-page":"1223","article-title":"Large scale distributed deep networks","volume-title":"Proceedings of Advances in neural information processing systems","author":"Dean"},{"key":"ref20","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref22","article-title":"Deep retrieval: An end-to-end learnable structure model for large-scale recommendations","volume-title":"CoRR","author":"Gao","year":"2020"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417050"},{"key":"ref24","article-title":"Compressing deep convolutional networks using vector quantization","author":"Gong","year":"2014"},{"key":"ref25","first-page":"4125","article-title":"Memory-efficient backpropagation through time","volume":"29","author":"Gruslys","year":"2016","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref26","first-page":"1737","article-title":"Deep learning with limited numerical precision","volume-title":"International conference on machine learning","author":"Gupta"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001163"},{"key":"ref28","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding","author":"Han","year":"2015"},{"key":"ref29","article-title":"Learning both weights and connections for efficient neural networks","author":"Han","year":"2015"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref33","first-page":"497","article-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization","volume-title":"Proceedings of Machine Learning and Systems","volume":"2","author":"Jain"},{"key":"ref34","first-page":"463","article-title":"A unified architecture for accelerating distributed {DNN} training in heterogeneous {GPU\/CPU} clusters","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Jiang"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3243904"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926294"},{"key":"ref37","article-title":"Scaling laws for neural language models","volume-title":"CoRR","author":"Kaplan","year":"2020"},{"key":"ref38","article-title":"Dynamic tensor rematerialization","author":"Kirisame","year":"2020"},{"key":"ref39","article-title":"Reducing activation recomputation in large transformer models","volume-title":"CoRR","author":"Korthikanti","year":"2022"},{"key":"ref40","article-title":"Tflms: Large model support in tensorflow by graph rewriting","author":"Le","year":"2018"},{"key":"ref41","article-title":"Mixed precision training","author":"Micikevicius","year":"2017"},{"key":"ref42","article-title":"Imagenet\/resnet-50 training in 224 seconds","author":"Mikami","year":"2018"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref44","first-page":"1","article-title":"Efficient large-scale language model training on gpu clusters using megatron-lm","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Narayanan"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330384"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref47","article-title":"Training large neural networks with constant memory using a new execution algorithm","author":"Pudipeddi","year":"2020"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref51","article-title":"Zero-offload: Democratizing billion-scale model training","author":"Ren","year":"2021"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS49563.2019.00017"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"ref54","article-title":"Horovod: fast and easy distributed deep learning in tensorflow","volume-title":"CoRR","author":"Sergeev","year":"2018"},{"key":"ref55","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","volume-title":"CoRR","author":"Shoeybi","year":"2019"},{"key":"ref56","article-title":"Delta: Dynamically optimizing gpu memory beyond tensor recomputation","author":"Tang","year":"2022"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"ref60","first-page":"595","article-title":"Gandiva: Introspective cluster scheduling for deep learning","volume-title":"Proceedings of 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao"},{"key":"ref61","article-title":"Gspmd: General and scalable parallelization for ml computation graphs","author":"Xu","year":"2021"},{"key":"ref62","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","volume-title":"CoRR","author":"Yang","year":"2019"},{"key":"ref63","article-title":"Pangu-\u03b1: Large-scale autoregressive pretrained chinese language models with auto-parallel computation","volume-title":"CoRR","author":"Zeng","year":"2021"},{"key":"ref64","article-title":"Alpa: Automating interand intra-operator parallelism for distributed deep learning","author":"Zheng","year":"2022"}],"event":{"name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","location":"Montreal, QC, Canada","start":{"date-parts":[[2023,2,25]]},"end":{"date-parts":[[2023,3,1]]}},"container-title":["2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10070856\/10070923\/10071077.pdf?arnumber=10071077","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T13:26:23Z","timestamp":1707830783000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10071077\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2]]},"references-count":64,"URL":"https:\/\/doi.org\/10.1109\/hpca56546.2023.10071077","relation":{},"subject":[],"published":{"date-parts":[[2023,2]]}}}