{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T16:06:05Z","timestamp":1780589165586,"version":"3.54.1"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Project","award":["2022YFB2901600"],"award-info":[{"award-number":["2022YFB2901600"]}]},{"name":"General Program of National Natural Science Foundation of China","award":["61971377"],"award-info":[{"award-number":["61971377"]}]},{"name":"Key Project of Natural Science Foundation of Zhejiang Province","award":["LZ22F010008"],"award-info":[{"award-number":["LZ22F010008"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1109\/tpds.2023.3331372","type":"journal-article","created":{"date-parts":[[2023,11,9]],"date-time":"2023-11-09T19:07:40Z","timestamp":1699556860000},"page":"123-139","source":"Crossref","is-referenced-by-count":11,"title":["US-Byte: An Efficient Communication Framework for Scheduling Unequal-Sized Tensor Blocks in Distributed Deep Learning"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-0599-8383","authenticated-orcid":false,"given":"Yunqi","family":"Gao","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0594-8998","authenticated-orcid":false,"given":"Bing","family":"Hu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9948-9165","authenticated-orcid":false,"given":"Mahdi Boloursaz","family":"Mashhadi","sequence":"additional","affiliation":[{"name":"5GIC &#x0026; 6GIC, Institute for Communication Systems (ICS), University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9680-2811","authenticated-orcid":false,"given":"A-Long","family":"Jin","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Pokfulam, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7886-5878","authenticated-orcid":false,"given":"Pei","family":"Xiao","sequence":"additional","affiliation":[{"name":"5GIC &#x0026; 6GIC, Institute for Communication Systems (ICS), University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7958-9687","authenticated-orcid":false,"given":"Chunming","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM48880.2022.9796820"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3217271"},{"key":"ref15","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc USENIX Symp Oper Syst Des Implement"},{"key":"ref14","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Proces Syst"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342085"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155282"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3137321"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/IWQoS57198.2023.10188699"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3090331"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3343180.3343186"},{"key":"ref17","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","author":"zhang","year":"2017","journal-title":"Proc USENIX Conf USENIX Annu Tech Conf"},{"key":"ref16","article-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems","author":"chen","year":"2015"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3052862"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3386367.3431307"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM48880.2022.9796752"},{"key":"ref46","first-page":"954","article-title":"CMFL: Mitigating communication overhead for federated learning","author":"luping","year":"2019","journal-title":"Proc IEEE 39th Int Conf Distrib Comput Syst"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00010"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737587"},{"key":"ref47","first-page":"629","article-title":"Gaia: Geo-distributed machine learning approaching LAN speeds","author":"hsieh","year":"2017","journal-title":"Proc USENIX Symp Netw Syst Des Implement"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"ref41","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2017"},{"key":"ref44","first-page":"172","article-title":"Sparse communication for distributed gradient descent","author":"aji","year":"2020","journal-title":"Proc Conf Empir Methods Natural Lang Process"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/473"},{"key":"ref49","first-page":"1223","article-title":"More effective distributed ML via a stale synchronous parallel parameter server","author":"ho","year":"2013","journal-title":"Proc Adv Neural Inf Proces Syst"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2959533"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3240833"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3244198"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"ref3","article-title":"Accurate, large minibatch SGD: Training imagenet in 1 hour","author":"goyal","year":"2017"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488803"},{"key":"ref40","first-page":"1509","article-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning","author":"wen","year":"2017","journal-title":"Proc Adv Neural Inf Proces Syst"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3123878.3131975"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"ref37","article-title":"Gloo","year":"2017"},{"key":"ref36","first-page":"1","article-title":"Naos: Serialization-free RDMA networking in Java","author":"taranov","year":"2021","journal-title":"Proc USENIX Annu Tech Conf"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref30","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref33","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737367"},{"key":"ref1","first-page":"1232","article-title":"Large scale distributed deep networks","author":"dean","year":"2012","journal-title":"Proc Adv Neural Inf Proces Syst"},{"key":"ref39","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","author":"alistarh","year":"2017","journal-title":"Proc Adv Neural Inf Proces Syst"},{"key":"ref38","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ML","author":"wang","year":"2020","journal-title":"Proc Conf Mach Learn Syst"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref25","article-title":"Learning multiple layers of features from tiny images","author":"krizhevsky","year":"2009"},{"key":"ref20","first-page":"132","article-title":"Priority-based parameter propagation for distributed DNN training","author":"jayarajan","year":"2019","journal-title":"Proc Conf Mach Learn Syst"},{"key":"ref22","article-title":"Horovod: Fast and easy distributed deep learning in TensorFlow","author":"sergeev","year":"2018"},{"key":"ref21","first-page":"418","article-title":"TicTac: Accelerating distributed deep learning with communication scheduling","author":"hashemi","year":"2019","journal-title":"Proc Conf Mach Learn Syst"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref27","first-page":"142","article-title":"Learning word vectors for sentiment analysis","author":"maas","year":"2011","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/10323392\/10314018.pdf?arnumber=10314018","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,13]],"date-time":"2023-12-13T19:55:47Z","timestamp":1702497347000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10314018\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1]]},"references-count":55,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2023.3331372","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,1]]}}}