{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:31:44Z","timestamp":1768030304655,"version":"3.49.0"},"reference-count":97,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802358"],"award-info":[{"award-number":["61802358"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772486"],"award-info":[{"award-number":["61772486"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CAREER-2048044"],"award-info":[{"award-number":["CAREER-2048044"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-1838024"],"award-info":[{"award-number":["IIS-1838024"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CCF-1756013"],"award-info":[{"award-number":["CCF-1756013"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1109\/tpds.2023.3266246","type":"journal-article","created":{"date-parts":[[2023,4,11]],"date-time":"2023-04-11T17:32:10Z","timestamp":1681234330000},"page":"1695-1712","source":"Crossref","is-referenced-by-count":1,"title":["A Generic, High-Performance, Compression-Aware Framework for Data Parallel DNN Training"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6318-4505","authenticated-orcid":false,"given":"Hao","family":"Wu","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7554-3492","authenticated-orcid":false,"given":"Shiyi","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6073-7011","authenticated-orcid":false,"given":"Youhui","family":"Bai","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7064-6120","authenticated-orcid":false,"given":"Cheng","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8964-8250","authenticated-orcid":false,"given":"Quan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8691-5452","authenticated-orcid":false,"given":"Jun","family":"Yi","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9840-7754","authenticated-orcid":false,"given":"Feng","family":"Yan","sequence":"additional","affiliation":[{"name":"Computer Science Department and Electrical and Computer Engineering Department, University of Houston, Houston, TX, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5060-8411","authenticated-orcid":false,"given":"Ruichuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Nokia Bell Labs, Murray Hill, NJ, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9586-0561","authenticated-orcid":false,"given":"Yinlong","family":"Xu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, University of Science and Technology of China, Hefei, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"265","article-title":"Tensorflow: A system for large-scale machine learning","volume-title":"Proc. USENIX Conf. Operating Syst. Des. Implementation","author":"Abadi"},{"key":"ref3","article-title":"Overlapping profiling results of HiPress","year":"2023"},{"key":"ref4","article-title":"Accordion: Adaptive gradient communication via critical learning regime identification","author":"Agarwal","year":"2020"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d17-1045"},{"key":"ref6","article-title":"Gradient compression in AWS","year":"2021"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/2858788.2688521"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"ref9","article-title":"Baidu. \u201cBringing HPC techniques to deep learning","year":"2017"},{"key":"ref10","article-title":"PaddlePaddle GitHub source code","year":"2021"},{"key":"ref11","first-page":"1","article-title":"The fifth PASCAL recognizing textual entailment challenge","volume-title":"Proc. 2nd Text Anal. Conf.","author":"Bentivogli"},{"key":"ref12","article-title":"signSGD: Compressed optimisation for non-convex problems","author":"Bernstein","year":"2018"},{"key":"ref13","article-title":"Open-source implementation of onebit algorithm","year":"2021"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11728"},{"key":"ref15","first-page":"1","article-title":"Revisiting distributed synchronous SGD","volume-title":"Proc. Int. Conf. Learn. Representations Workshop Track","author":"Chen"},{"key":"ref16","first-page":"578","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Des. Implementation","author":"Chen"},{"key":"ref17","first-page":"571","article-title":"Project adam: Building an efficient and scalable deep learning training system","volume-title":"Proc. 11th USENIX Symp. Operating Syst. Des. Implementation","author":"Chilimbi"},{"key":"ref18","first-page":"1","article-title":"GradZip: Gradient compression using alternating matrix factorization for large-scale deep learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Cho"},{"key":"ref19","article-title":"Intel nGraph: An intermediate representation, compiler, and executor for deep learning","author":"Cyphers","year":"2018"},{"key":"ref20","first-page":"1223","article-title":"Large scale distributed deep networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dean"},{"key":"ref21","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575703"},{"key":"ref26","article-title":"Taming momentum in a distributed asynchronous environment","author":"Hakimi","year":"2019"},{"key":"ref27","article-title":"Bank conflict in GPU","author":"Harris","year":"2013"},{"key":"ref28","article-title":"TicTac: Accelerating distributed deep learning with communication scheduling","author":"Hashemi","year":"2018"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref30","first-page":"1223","article-title":"More effective distributed ML via a stale synchronous parallel parameter server","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ho"},{"key":"ref31","article-title":"Priority-based parameter propagation for distributed dnn training","author":"Jayarajan","year":"2019"},{"key":"ref32","article-title":"Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes","author":"Jia","year":"2018"},{"key":"ref33","first-page":"463","article-title":"A unified architecture for accelerating distributed DNN training in heterogeneous gpu\/cpu clusters","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation","author":"Jiang"},{"key":"ref34","first-page":"1","article-title":"U-GAT-IT: Unsupervised generative attentional networks with adaptive layer-instance normalization for image-to-image translation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kim"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303957"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.14778\/3342263.3342276"},{"key":"ref37","article-title":"LAPACK and INTEL math kernel library teams. linear algebra PACKage","year":"2022"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685095"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545011"},{"key":"ref40","first-page":"279","article-title":"Accelerating distributed reinforcement learning with in-switch computing","volume-title":"Proc. 46th Int. Symp. Comput. Architecture","author":"Li"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465289"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551828"},{"key":"ref43","first-page":"53","article-title":"3LC: Lightweight and effective traffic compression for distributed machine learning","volume-title":"Proc. Mach. Learn. Syst.","volume":"1","author":"Lim","year":"2019"},{"key":"ref44","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"Lin","year":"2017"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267840"},{"key":"ref46","article-title":"MARVELL White paper for 25 Gb ethernet","year":"2021"},{"key":"ref47","article-title":"Revisiting small batch training for deep neural networks","author":"Masters","year":"2018"},{"key":"ref48","article-title":"Mellanox corporate update","year":"2021"},{"key":"ref49","article-title":"The wikitext long term dependency language modeling dataset","year":"2016"},{"key":"ref50","article-title":"Regularizing and optimizing LSTM language models","author":"Merity","year":"2017"},{"key":"ref51","article-title":"Gradient compression in Meta","year":"2021"},{"key":"ref52","article-title":"Hook of PyTorch","year":"2021"},{"key":"ref53","article-title":"Gradient compression in PyTorch","year":"2022"},{"key":"ref54","article-title":"PyTorch PowerSGD communication hook","year":"2022"},{"key":"ref55","article-title":"ImageNet\/ResNet-50 training in 224 seconds","author":"Mikami","year":"2018"},{"key":"ref57","article-title":"Distributed training of deep learning models on azure","year":"2021"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/IPCCC47392.2019.8958738"},{"key":"ref60","article-title":"A timeline of innovation for NVIDIA","year":"2021"},{"key":"ref61","article-title":"The API reference guide for thrust, the CUDA C template library","year":"2021"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132766"},{"key":"ref66","article-title":"Deep gradient compression implementation in the common layer using CUDA","author":"Pan","year":"2018"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"ref72","article-title":"Glow: Graph lowering compiler techniques for neural networks","year":"2019"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref75","first-page":"785","article-title":"Scaling distributed machine learning with in-network aggregation","volume-title":"Proc. 18th USENIX Symp. Netw. Syst. Des. Implementation","author":"Sapio"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-274"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4739"},{"key":"ref78","article-title":"Horovod: Fast and easy distributed deep learning in tensorflow","author":"Sergeev","year":"2018"},{"key":"ref79","first-page":"10435","article-title":"Mesh-TensorFlow: Deep learning for supercomputers","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Shazeer"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155269"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"ref82","first-page":"401","article-title":"Towards scalable distributed training of deep learning on public cloud clusters","volume-title":"Proc. Mach. Learn. Syst.","author":"Shi"},{"key":"ref83","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-354"},{"key":"ref85","first-page":"3365","article-title":"Communication-efficient distributed learning via lazily aggregated quantized gradients","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sun"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref87","first-page":"14269","article-title":"PowerSGD: Practical low-rank gradient compression for distributed optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vogels"},{"key":"ref88","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ml","volume-title":"Proc. Conf. Mach. Learn. Syst.","volume":"2","author":"Wang"},{"key":"ref89","first-page":"9872","article-title":"ATOMO: Communication-efficient learning via atomic sparsification","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref90","article-title":"Unifying data, model and hybrid parallelism in deep learning via tensor tiling","author":"Wang","year":"2018"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"ref92","article-title":"ByteComp: Revisiting gradient compression in distributed training","author":"Wang","year":"2022"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/2806777.2806778"},{"key":"ref94","first-page":"1509","article-title":"TernGrad: Ternary gradients to reduce communication in distributed deep learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wen"},{"key":"ref95","article-title":"List of NVIDIA graphics processing units","year":"2021"},{"key":"ref96","article-title":"Error compensated quantized SGD and its applications to large-scale distributed optimization","author":"Wu","year":"2018"},{"key":"ref97","first-page":"595","article-title":"Gandiva: Introspective cluster scheduling for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Des. Implementation","author":"Xiao"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783323"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00060"},{"key":"ref100","first-page":"21150","article-title":"DeepReduce: A sparse-tensor communication framework for federated deep learning","volume":"34","author":"Xu","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303975"},{"key":"ref102","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","author":"Yang","year":"2019"},{"key":"ref103","first-page":"5123","article-title":"GradiVeQ: Vector quantization for bandwidth-efficient gradient aggregation in distributed CNN training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yu"},{"key":"ref104","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Zhang"},{"key":"ref105","first-page":"337","article-title":"Daydream: Accurately estimating the efficacy of optimizations for DNN training","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Zhu"},{"key":"ref106","first-page":"2595","article-title":"Parallelized stochastic gradient descent","volume-title":"Proc. 23rd Int. Conf. Neural Inf. Process. Syst.","author":"Zinkevich"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/71\/11045205\/10098952-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/11045205\/10098952.pdf?arnumber=10098952","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,26]],"date-time":"2025-06-26T17:45:19Z","timestamp":1750959919000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10098952\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":97,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2023.3266246","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8]]}}}