{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T23:04:06Z","timestamp":1774047846644,"version":"3.50.1"},"reference-count":47,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100002855","name":"Ministry of Science and Technology of the People&apos;s Republic of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002855","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFA1011704"],"award-info":[{"award-number":["2023YFA1011704"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["CO2-EOR"],"award-info":[{"award-number":["CO2-EOR"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Systems Architecture"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.sysarc.2026.103770","type":"journal-article","created":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T16:55:57Z","timestamp":1773075357000},"page":"103770","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["HAGC: A Hardware-Aware Gradient Compression framework for distributed deep 
learning"],"prefix":"10.1016","volume":"175","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5762-1904","authenticated-orcid":false,"given":"Aiqiang","family":"Yang","sequence":"first","affiliation":[]},{"given":"Jie","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qinglin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zeyao","family":"Mo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5224-4048","authenticated-orcid":false,"given":"Keqin","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.sysarc.2026.103770_b1","first-page":"297","article-title":"An efficient statistical-based gradient compression technique for distributed training systems","volume":"3","author":"M Abdelmoniem","year":"2021","journal-title":"Proc. Mach. Learn. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b2","series-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"Lin","year":"2017"},{"key":"10.1016\/j.sysarc.2026.103770_b3","article-title":"PowerSGD: Practical low-rank gradient compression for distributed optimization","volume":"32","author":"Vogels","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b4","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","volume":"30","author":"Alistarh","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b5","doi-asserted-by":"crossref","DOI":"10.1109\/TCOMM.2024.3385922","article-title":"Effective communication with dynamic feature compression","author":"Talli","year":"2024","journal-title":"IEEE Trans. Commun."},{"issue":"276","key":"10.1016\/j.sysarc.2026.103770_b6","first-page":"1","article-title":"On biased compression for distributed learning","volume":"24","author":"Beznosikov","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.sysarc.2026.103770_b7","unstructured":"M. Li, R.B. Basat, S. Vargaftik, C. Lao, K. Xu, M. Mitzenmacher, M. Yu, {THC}: Accelerating Distributed Deep Learning Using Tensor Homomorphic Compression, in: 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), 2024, pp. 1191\u20131211."},{"key":"10.1016\/j.sysarc.2026.103770_b8","doi-asserted-by":"crossref","unstructured":"K. Huang, B. Ni, X. Yang, Efficient quantization for neural networks with binary weights and low bitwidth activations, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 33, 2019, pp. 3854\u20133861.","DOI":"10.1609\/aaai.v33i01.33013854"},{"key":"10.1016\/j.sysarc.2026.103770_b9","series-title":"Quantizing deep convolutional networks for efficient inference: A whitepaper","author":"Krishnamoorthi","year":"2018"},{"key":"10.1016\/j.sysarc.2026.103770_b10","doi-asserted-by":"crossref","unstructured":"Y. Peng, Y. Zhu, Y. Chen, Y. Bao, B. Yi, C. Lan, C. Wu, C. Guo, A generic communication scheduler for distributed DNN training acceleration, in: Proceedings of the 27th ACM Symposium on Operating Systems Principles, 2019, pp. 
16\u201329.","DOI":"10.1145\/3341301.3359642"},{"key":"10.1016\/j.sysarc.2026.103770_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2023.103025","article-title":"A convertible neural processor supporting adaptive quantization for real-time neural networks","volume":"145","author":"Kal","year":"2023","journal-title":"J. Syst. Archit."},{"key":"10.1016\/j.sysarc.2026.103770_b12","series-title":"International Conference on Machine Learning","first-page":"8852","article-title":"DAdaQuant: Doubly-adaptive quantization for communication-efficient federated learning","author":"H\u00f6nig","year":"2022"},{"key":"10.1016\/j.sysarc.2026.103770_b13","series-title":"Adaptive discrete communication bottlenecks with dynamic vector quantization","author":"Liu","year":"2022"},{"key":"10.1016\/j.sysarc.2026.103770_b14","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2023.102927","article-title":"Communication compression techniques in distributed deep learning: A survey","volume":"142","author":"Wang","year":"2023","journal-title":"J. Syst. Archit."},{"key":"10.1016\/j.sysarc.2026.103770_b15","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2025.103486","article-title":"A comprehensive review on hardware implementations of lattice-based cryptographic schemes","author":"Ahmadunnisa","year":"2025","journal-title":"J. Syst. Archit."},{"issue":"1","key":"10.1016\/j.sysarc.2026.103770_b16","doi-asserted-by":"crossref","first-page":"11658","DOI":"10.1038\/s41598-023-38916-x","article-title":"Two-layer accumulated quantized compression for communication-efficient federated learning: TLAQC","volume":"13","author":"Ren","year":"2023","journal-title":"Sci. Rep."},{"key":"10.1016\/j.sysarc.2026.103770_b17","series-title":"Spatio-temporal communication compression in distributed prime-dual flows","author":"Ren","year":"2024"},{"key":"10.1016\/j.sysarc.2026.103770_b18","series-title":"On distributed adaptive optimization with gradient compression","author":"Li","year":"2022"},{"key":"10.1016\/j.sysarc.2026.103770_b19","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10478","article-title":"Step-ahead error feedback for distributed training with compressed gradient","volume":"35","author":"Xu","year":"2021"},{"key":"10.1016\/j.sysarc.2026.103770_b20","series-title":"International Conference on Machine Learning","first-page":"3252","article-title":"Error feedback fixes signsgd and other gradient compression schemes","author":"Karimireddy","year":"2019"},{"key":"10.1016\/j.sysarc.2026.103770_b21","series-title":"2023 IEEE International Symposium on Information Theory","first-page":"1160","article-title":"Compressed error harq: Feedback communication on noise-asymmetric channels","author":"Ankireddy","year":"2023"},{"key":"10.1016\/j.sysarc.2026.103770_b22","article-title":"Vector quantization with error uniformly distributed over an arbitrary set","author":"Ling","year":"2024","journal-title":"IEEE Trans. Inform. Theory"},{"key":"10.1016\/j.sysarc.2026.103770_b23","article-title":"Differentially private stochastic gradient descent via compression and memorization","volume":"135","author":"Phuong","year":"2023","journal-title":"J. Syst. Archit."},{"key":"10.1016\/j.sysarc.2026.103770_b24","doi-asserted-by":"crossref","unstructured":"A. Gondimalla, M. Thottethodi, T. 
Vijaykumar, Eureka: Efficient Tensor Cores for One-sided Unstructured Sparsity in DNN Inference, in: Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture, 2023, pp. 324\u2013337.","DOI":"10.1145\/3613424.3614312"},{"key":"10.1016\/j.sysarc.2026.103770_b25","series-title":"SC24: International Conference for High Performance Computing, Networking, Storage and Analysis","first-page":"1","article-title":"Accelerating communication in deep learning recommendation model training with dual-level adaptive lossy compression","author":"Feng","year":"2024"},{"issue":"7","key":"10.1016\/j.sysarc.2026.103770_b26","first-page":"1878","article-title":"Accelerating binarized neural networks via bit-tensor-cores in turing gpus","volume":"32","author":"Li","year":"2020","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b27","series-title":"2021 Design, Automation & Test in Europe Conference & Exhibition","first-page":"120","article-title":"Efficient tensor cores support in tvm for low-latency deep learning","author":"Sun","year":"2021"},{"key":"10.1016\/j.sysarc.2026.103770_b28","series-title":"2018 IEEE International Parallel and Distributed Processing Symposium Workshops","first-page":"522","article-title":"Nvidia tensor core programmability, performance & precision","author":"Markidis","year":"2018"},{"key":"10.1016\/j.sysarc.2026.103770_b29","series-title":"Tcfft: Accelerating half-precision FFT through tensor cores","author":"Li","year":"2021"},{"key":"10.1016\/j.sysarc.2026.103770_b30","doi-asserted-by":"crossref","unstructured":"Y. Liu, Y. Xue, Y. Cheng, L. Ma, Z. Miao, J. Xue, J. Huang, Scaling Deep Learning Computation over the Inter-Core Connected Intelligence Processor with T10, in: Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles, 2024, pp. 505\u2013521.","DOI":"10.1145\/3694715.3695955"},{"issue":"1","key":"10.1016\/j.sysarc.2026.103770_b31","doi-asserted-by":"crossref","first-page":"72","DOI":"10.1109\/TPDS.2020.3011893","article-title":"GPU tensor cores for fast arithmetic reductions","volume":"32","author":"Navarro","year":"2020","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b32","series-title":"Proceedings of the ACM International Conference on Supercomputing","first-page":"46","article-title":"Accelerating reduction and scan using tensor core units","author":"Dakkak","year":"2019"},{"key":"10.1016\/j.sysarc.2026.103770_b33","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1016\/j.future.2022.04.023","article-title":"Squeeze: Efficient compact fractals for tensor core gpus","volume":"135","author":"Quezada","year":"2022","journal-title":"Future Gener. Comput. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b34","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2025.103333","article-title":"Accelerating tensor multiplication by exploring hybrid product with hardware and software co-design","volume":"159","author":"Zhang","year":"2025","journal-title":"J. Syst. 
Archit."},{"key":"10.1016\/j.sysarc.2026.103770_b35","series-title":"Fp8 formats for deep learning","author":"Micikevicius","year":"2022"},{"key":"10.1016\/j.sysarc.2026.103770_b36","series-title":"Fp8-lm: Training fp8 large language models","author":"Peng","year":"2023"},{"key":"10.1016\/j.sysarc.2026.103770_b37","series-title":"2023 IEEE 43rd International Conference on Distributed Computing Systems","first-page":"361","article-title":"Evaluation and optimization of gradient compression for distributed deep learning","author":"Zhang","year":"2023"},{"key":"10.1016\/j.sysarc.2026.103770_b38","unstructured":"X.-Y. Liu, T. Zhang, H. Hong, H. Huang, H. Lu, High-performance computing primitives for tensor networks learning operations on GPUs, in: Proc. Int. Conf. Neural Inf. Process. Syst. Workshop Quantum Tensor Netw. Mach. Learn, 2020, pp. 1\u20139."},{"issue":"7840","key":"10.1016\/j.sysarc.2026.103770_b39","doi-asserted-by":"crossref","first-page":"52","DOI":"10.1038\/s41586-020-03070-1","article-title":"Parallel convolutional processing using an integrated photonic tensor core","volume":"589","author":"Feldmann","year":"2021","journal-title":"Nature"},{"key":"10.1016\/j.sysarc.2026.103770_b40","article-title":"Sparsified SGD with memory","volume":"31","author":"Stich","year":"2018","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.sysarc.2026.103770_b41","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.sysarc.2026.103770_b42","series-title":"International Conference on Machine Learning","first-page":"6105","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","author":"Tan","year":"2019"},{"key":"10.1016\/j.sysarc.2026.103770_b43","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.sysarc.2026.103770_b44","series-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.sysarc.2026.103770_b45","series-title":"Learning Multiple Layers of Features from Tiny Images","author":"Krizhevsky","year":"2009"},{"issue":"11","key":"10.1016\/j.sysarc.2026.103770_b46","doi-asserted-by":"crossref","first-page":"2278","DOI":"10.1109\/5.726791","article-title":"Gradient-based learning applied to document recognition","volume":"86","author":"LeCun","year":"1998","journal-title":"Proc. 
IEEE"},{"key":"10.1016\/j.sysarc.2026.103770_b47","series-title":"2009 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"Deng","year":"2009"}],"container-title":["Journal of Systems Architecture"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1383762126000883?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1383762126000883?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T21:55:29Z","timestamp":1774043729000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1383762126000883"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":47,"alternative-id":["S1383762126000883"],"URL":"https:\/\/doi.org\/10.1016\/j.sysarc.2026.103770","relation":{},"ISSN":["1383-7621"],"issn-type":[{"value":"1383-7621","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"HAGC: A Hardware-Aware Gradient Compression framework for distributed deep learning","name":"articletitle","label":"Article Title"},{"value":"Journal of Systems Architecture","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.sysarc.2026.103770","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103770"}}