{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T19:36:41Z","timestamp":1774294601994,"version":"3.50.1"},"reference-count":256,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2022,1,15]],"date-time":"2022-01-15T00:00:00Z","timestamp":1642204800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,1,15]],"date-time":"2022-01-15T00:00:00Z","timestamp":1642204800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,1,15]],"date-time":"2022-01-15T00:00:00Z","timestamp":1642204800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2016YFB0800402"],"award-info":[{"award-number":["2016YFB0800402"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Hong Kong RGC Research Impact Fund","award":["R5060-19"],"award-info":[{"award-number":["R5060-19"]}]},{"DOI":"10.13039\/501100002920","name":"General Research Fund","doi-asserted-by":"publisher","award":["152221\/19E"],"award-info":[{"award-number":["152221\/19E"]}],"id":[{"id":"10.13039\/501100002920","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002920","name":"General Research Fund","doi-asserted-by":"publisher","award":["15220320\/20E"],"award-info":[{"award-number":["15220320\/20E"]}],"id":[{"id":"10.13039\/501100002920","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Collaborative Research Fund","award":["C5026-18G"],"award-info":[{"award-number":["C5026-18G"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61872310"],"award-info":[{"award-number":["61872310"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1836204"],"award-info":[{"award-number":["U1836204"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1936108"],"award-info":[{"award-number":["U1936108"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Projects of the National Social Science Foundation","award":["16ZDA092"],"award-info":[{"award-number":["16ZDA092"]}]},{"DOI":"10.13039\/501100010877","name":"Shenzhen Science and Technology Innovation Commission","doi-asserted-by":"publisher","award":["R2020A045"],"award-info":[{"award-number":["R2020A045"]}],"id":[{"id":"10.13039\/501100010877","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["200202176"],"award-info":[{"award-number":["200202176"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["210202079"],"award-info":[{"award-number":["210202079"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2019M661709"],"award-info":[{"award-number":["2019M661709"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002920","name":"Research Grants Council of the Hong Kong Special Administrative Region, China","doi-asserted-by":"publisher","award":["PolyU15222621"],"award-info":[{"award-number":["PolyU15222621"]}],"id":[{"id":"10.13039\/501100002920","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Internet Things J."],"published-print":{"date-parts":[[2022,1,15]]},"DOI":"10.1109\/jiot.2021.3111624","type":"journal-article","created":{"date-parts":[[2021,9,10]],"date-time":"2021-09-10T20:42:11Z","timestamp":1631306531000},"page":"939-963","source":"Crossref","is-referenced-by-count":42,"title":["A Comprehensive Survey on Training Acceleration for Large Machine Learning Models in IoT"],"prefix":"10.1109","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7591-5315","authenticated-orcid":false,"given":"Haozhao","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7538-1985","authenticated-orcid":false,"given":"Zhihao","family":"Qu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0328-2894","authenticated-orcid":false,"given":"Qihua","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Haobo","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Boyuan","family":"Luo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3256-8261","authenticated-orcid":false,"given":"Wenchao","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9831-2202","authenticated-orcid":false,"given":"Song","family":"Guo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7791-5511","authenticated-orcid":false,"given":"Ruixuan","family":"Li","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref170","article-title":"Omnivore: An optimizer for multi-device deep learning on CPUs and GPUs","author":"hadjis","year":"2016"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1145\/3365657"},{"key":"ref171","article-title":"Placeto: Learning generalizable device placement algorithms for distributed machine learning","author":"addanki","year":"2019"},{"key":"ref174","first-page":"1223","article-title":"Large scale distributed deep networks","author":"dean","year":"2012","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref173","article-title":"A hierarchical model for device placement","author":"mirhoseini","year":"2018","journal-title":"Proc ICLR"},{"key":"ref176","article-title":"At stability&#x2019;s edge: How to adjust hyperparameters to preserve minima selection in asynchronous training of neural networks?","author":"giladi","year":"2020","journal-title":"Proc 8th Int Conf Learn Represent (ICLR)"},{"key":"ref175","first-page":"22","article-title":"Solving the straggler problem with bounded staleness","author":"cipar","year":"2013","journal-title":"Proc HotOS"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2017.2736066"},{"key":"ref177","first-page":"4120","article-title":"Asynchronous stochastic gradient descent with delay compensation","author":"zheng","year":"2017","journal-title":"Proc 34th Int Conf Mach Learn (ICML)"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2018.00102"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s13748-012-0035-5"},{"key":"ref33","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref32","first-page":"9017","article-title":"FastGRNN: A fast, accurate, stable and tiny kilobyte sized gated recurrent neural network","author":"kusupati","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref31","author":"kalchbrenner","year":"2015","journal-title":"Grid long short-term memory"},{"key":"ref30","author":"koutnik","year":"2014","journal-title":"A clockwork RNN"},{"key":"ref37","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref36","first-page":"5754","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","author":"yang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.2.270"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2019.8849684"},{"key":"ref180","first-page":"1215","article-title":"Lagrange coded computing: Optimal design for resiliency, security, and privacy","author":"yu","year":"2019","journal-title":"Proc 22nd Int Conf Artif Intell Stat (AISTATS)"},{"key":"ref185","first-page":"6415","article-title":"Moniqua: Modulo quantized communication in decentralized SGD","author":"lu","year":"2020","journal-title":"Proc 37th Int Conf Mach Learn (ICML)"},{"key":"ref184","first-page":"6155","article-title":"DoubleSqueeze: Parallel stochastic gradient descent with double-pass error-compensated compression","author":"tang","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref183","article-title":"Approximate gradient coding via sparse random graphs","author":"charles","year":"2017"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2019.8849514"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/473"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3084104"},{"key":"ref187","article-title":"Decentralized deep learning with arbitrary communication compression","author":"koloskova","year":"2020","journal-title":"Proc 8th Int Conf Learn Represent (ICLR)"},{"key":"ref186","first-page":"5977","article-title":"The convergence of sparsified gradient methods","author":"alistarh","year":"2018","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref27","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref179","article-title":"Polynomially coded regression: Optimal straggler mitigation via data encoding","author":"li","year":"2018"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2019.2918951"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2019.2904897"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2020.2986024"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/5254.708428"},{"key":"ref23","doi-asserted-by":"crossref","first-page":"5476","DOI":"10.1109\/JIOT.2020.3030072","article-title":"A survey on federated learning: The journey from centralized to distributed on-site learning and beyond","volume":"8","author":"rahman","year":"2021","journal-title":"IEEE Internet of Things Journal"},{"key":"ref26","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref50","first-page":"1","article-title":"Learning from brains how to regularize machines","author":"li","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref51","article-title":"SlowMo: Improving communication-efficient distributed SGD with slow momentum","author":"wang","year":"2019"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155237"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.2981338"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2016.44"},{"key":"ref155","first-page":"485","article-title":"Tiresias: A GPU cluster manager for distributed deep learning","author":"gu","year":"2019","journal-title":"Proc 16th USENIX Symp Netw Syst Design Implement (NSDI)"},{"key":"ref150","first-page":"11285","article-title":"TinyTL: Reduce memory, not parameters for efficient on-device learning","author":"cai","year":"2020","journal-title":"Proc Annu Conf Neural Inf Process Syst"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1145\/3397315"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/SRDS51746.2020.00017"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8546129"},{"key":"ref147","author":"han","year":"2015","journal-title":"Learning both weights and connections for efficient neural networks"},{"key":"ref148","first-page":"4857","article-title":"Learning to prune deep neural networks via layer-wise optimal brain surgeon","author":"dong","year":"2017","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref149","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding","author":"han","year":"2016"},{"key":"ref59","first-page":"1","article-title":"On the convergence of Adam and beyond","author":"reddi","year":"2018","journal-title":"Proc 6th Int Conf Learn Represent (ICLR)"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015240"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015741"},{"key":"ref56","article-title":"WNGrad: Learn the learning rate in gradient descent","author":"wu","year":"2018"},{"key":"ref55","first-page":"11574","article-title":"Adagrad stepsizes: Sharp convergence over nonconvex landscapes","volume":"97","author":"ward","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref54","article-title":"Finite-sum smooth optimization with SARAH","author":"nguyen","year":"2019"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/422"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.2975189"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-36133-2_1"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751225"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"ref165","article-title":"Salus: Fine-grained GPU sharing primitives for deep learning applications","author":"yu","year":"2019"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3063180"},{"key":"ref163","first-page":"4302","article-title":"Gradient coding from cyclic MDS codes and expander graphs","author":"raviv","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn (ICML)"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00062"},{"key":"ref161","first-page":"3368","article-title":"Gradient coding: Avoiding stragglers in distributed learning","author":"tandon","year":"2017","journal-title":"Proc 34th Int Conf Mach Learn (ICML)"},{"key":"ref160","first-page":"37","article-title":"Exploiting bounded staleness to speed up big data analytics","author":"cui","year":"2014","journal-title":"Proc USENIX Annu Techn Conf (ATC)"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3013306"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2018.2875246"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3023000"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3022323"},{"key":"ref8","first-page":"3555","article-title":"Multi-layered gradient boosting decision trees","author":"feng","year":"2018","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref159","first-page":"2737","article-title":"Asynchronous parallel stochastic gradient for nonconvex optimization","author":"lian","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301241"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"ref9","first-page":"1106","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc 26th Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018682"},{"key":"ref45","first-page":"840","article-title":"Rethinking loss design for large-scale 3D shape retrieval","author":"li","year":"2018","journal-title":"Proc 28th Int Joint Conf Artif Intell"},{"key":"ref48","first-page":"139","article-title":"Efficient full-matrix adaptive regularization","volume":"97","author":"agarwal","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013894"},{"key":"ref42","first-page":"1273","article-title":"Communication-efficient learning of deep networks from decentralized data","author":"mcmahan","year":"2017","journal-title":"Proc AISTATS"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/MIC.2006.74"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.2967772"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3022534"},{"key":"ref73","first-page":"2781","article-title":"Nonconvex variance reduced optimization with arbitrary sampling","author":"horv\u00e1th","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref72","first-page":"1646","article-title":"SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives","author":"defazio","year":"2014","journal-title":"Proc Annu Conf Neural Inf Process Syst"},{"key":"ref71","first-page":"3100","article-title":"Improved zeroth-order variance reduced algorithms and analysis for nonconvex optimization","volume":"97","author":"ji","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref70","first-page":"1","article-title":"Stabilized SVRG: Simple variance reduction for nonconvex optimization","volume":"99","author":"ge","year":"2019","journal-title":"Proc 32nd Annu Conf Learn Theory"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098147"},{"key":"ref77","article-title":"Control batch size and learning rate to generalize well: Theoretical and empirical evidence","author":"he","year":"2019","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref74","first-page":"983","article-title":"On the convergence of stochastic gradient descent with adaptive stepsizes","volume":"89","author":"li","year":"2018","journal-title":"Proc AISTATS"},{"key":"ref75","article-title":"Finite-time performance bounds and adaptive learning rate selection for two time-scale reinforcement learning","author":"gupta","year":"2019"},{"key":"ref78","first-page":"10420","article-title":"Escaping saddle points with adaptive gradient methods","volume":"97","author":"staib","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref79","first-page":"1","article-title":"Adaptive gradient methods with dynamic bound of learning rate","author":"luo","year":"2019","journal-title":"Proc 7th Int Conf Learn Represent (ICLR)"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/355"},{"key":"ref62","article-title":"Improving neural networks by preventing co-adaptation of feature detectors","author":"hinton","year":"2012"},{"key":"ref61","first-page":"2013","article-title":"Incorporating Nesterov momentum into adam","author":"dozat","year":"2016","journal-title":"Proc ICLR Workshop"},{"key":"ref63","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc 32nd Int Conf Mach Learn (ICML)"},{"key":"ref64","first-page":"901","article-title":"Weight normalization: A simple reparameterization to accelerate training of deep neural networks","author":"salimans","year":"2016","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref65","first-page":"2909","article-title":"Improving the robustness of deep neural networks via adversarial training with triplet loss","author":"li","year":"2018","journal-title":"Proc 28th Int Joint Conf Artif Intell"},{"key":"ref66","first-page":"3531","article-title":"Analyzing and improving representations with the soft nearest neighbor loss","volume":"97","author":"frosst","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref67","first-page":"1","article-title":"Towards explaining the regularization effect of initial large learning rate in training neural networks","author":"li","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref68","first-page":"5996","article-title":"Learning optimal linear regularizers","volume":"97","author":"streeter","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref69","first-page":"315","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","author":"johnson","year":"2013","journal-title":"Proc 26th Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref197","first-page":"1306","article-title":"Gradient sparsification for communication-efficient distributed optimization","author":"wangni","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref198","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2018","journal-title":"Proc 6th Int Conf Learn Represent (ICLR)"},{"key":"ref199","first-page":"4452","article-title":"Sparsified SGD with memory","author":"stich","year":"2018","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-274"},{"key":"ref194","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","author":"alistarh","year":"2017","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref195","first-page":"3478","article-title":"Decentralized stochastic optimization and gossip algorithms with compressed communication","author":"koloskova","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref196","article-title":"Strategies and principles of distributed machine learning on big data","author":"xing","year":"2015"},{"key":"ref95","first-page":"629","article-title":"Gaia: Geo-distributed machine learning approaching LAN speeds","author":"hsieh","year":"2017","journal-title":"Proc NSDI"},{"key":"ref94","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","author":"zhang","year":"2017","journal-title":"Proc ATC"},{"key":"ref190","first-page":"14236","article-title":"PowerSGD: Practical low-rank gradient compression for distributed optimization","author":"vogels","year":"2019","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref93","article-title":"AMPNet: Asynchronous model-parallel training for dynamic neural networks","author":"gaunt","year":"2017"},{"key":"ref191","article-title":"Practical low-rank communication compression in decentralized deep learning","author":"vogels","year":"2020","journal-title":"Proc Annu Conf Neural Inf Process Syst"},{"key":"ref92","first-page":"2834","article-title":"On model parallelization and scheduling strategies for distributed machine learning","author":"lee","year":"2014","journal-title":"Proc NeurIPS"},{"key":"ref192","first-page":"4035","article-title":"ZipML: Training linear models with end-to-end low precision, and a little bit of deep learning","author":"zhang","year":"2017","journal-title":"Proc 34th Int Conf Mach Learn (ICML)"},{"key":"ref91","first-page":"1676","article-title":"Spotlight: Optimizing device placement for training deep neural networks","author":"gao","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn (ICML)"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241559"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2018.8485875"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1145\/3357223.3362708"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2016.2579198"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_37"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01014"},{"key":"ref84","first-page":"165","article-title":"Optimal distributed online prediction using mini-batches","volume":"13","author":"dekel","year":"2012","journal-title":"J Mach Learn Res"},{"key":"ref83","article-title":"Federated optimization:Distributed optimization beyond the datacenter","author":"konec?\u00fd","year":"2015"},{"key":"ref80","first-page":"5941","article-title":"Exact natural gradient in deep linear networks and application to the nonlinear case","author":"bernacchia","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1137\/100802001"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.01.037"},{"key":"ref86","first-page":"3149","article-title":"LightGBM: A highly efficient gradient boosting decision tree","author":"ke","year":"2017","journal-title":"Proc NeurIPS"},{"key":"ref87","first-page":"103","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Proc NeurIPS"},{"key":"ref88","author":"hosmer","year":"1989","journal-title":"Applied Logistic Regression"},{"key":"ref200","first-page":"2","article-title":"Algorithmic regularization in over-parameterized matrix sensing and neural networks with quadratic activations","author":"li","year":"2018","journal-title":"Proc Conf Learn Theory (COLT)"},{"key":"ref101","article-title":"Towards federated learning at scale: System design","author":"bonawitz","year":"2019"},{"key":"ref100","first-page":"3068","article-title":"Communication-efficient distributed dual coordinate ascent","author":"jaggi","year":"2014","journal-title":"Proc NIPS"},{"key":"ref209","first-page":"5050","article-title":"LAG: Lazily aggregated gradient for communication-efficient distributed learning","author":"chen","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref203","author":"wu","year":"2020","journal-title":"On the convergence of quantized parallel restarted SGD for serverless learning"},{"key":"ref204","article-title":"Accurate, large minibatch SGD: Training ImageNet in 1 hour","author":"goyal","year":"2017"},{"key":"ref201","first-page":"1827","article-title":"Characterizing implicit bias in terms of optimization geometry","author":"gunasekar","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn (ICML)"},{"key":"ref202","article-title":"Qsparse-local-SGD: Communication efficient distributed SGD with quantization, sparsification, and local computations","author":"basu","year":"2019"},{"key":"ref207","first-page":"7252","article-title":"Bayesian nonparametric federated learning of neural networks","author":"yurochkin","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref208","first-page":"10334","article-title":"Is local SGD better than minibatch SGD?","author":"woodworth","year":"2020","journal-title":"Proc 37th Int Conf Mach Learn (ICML)"},{"key":"ref205","author":"you","year":"2020","journal-title":"The limit of the batch size"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/447"},{"key":"ref211","first-page":"954","article-title":"CMFL: Mitigating communication overhead for federated learning","author":"luping","year":"2019","journal-title":"Proc IEEE 39th Int Conf Distrib Comput Syst (ICDCS)"},{"key":"ref210","first-page":"3365","article-title":"Communication-efficient distributed learning via lazily aggregated quantized gradients","author":"sun","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref212","article-title":"Intermittent pulling with local compensation for communication-efficient federated learning","author":"wang","year":"2020","journal-title":"IEEE Trans Emerg Topics Comput"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2021.3099977"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225069"},{"key":"ref215","article-title":"Large batch optimization for deep learning: Training BERT in 76 minutes","author":"you","year":"2020","journal-title":"Proc 8th Int Conf Learn Represent (ICLR)"},{"key":"ref216","first-page":"1998","article-title":"Gradient diversity: A key ingredient for scalable distributed learning","author":"yin","year":"2018","journal-title":"Proc Int Conf Artif Intell Stat (AISTATS)"},{"key":"ref217","article-title":"Don&#x2019;t use large mini-batches, use local SGD","author":"lin","year":"2020","journal-title":"Proc 8th Int Conf Learn Represent (ICLR)"},{"key":"ref218","article-title":"Local SGD converges fast and communicates little","author":"stich","year":"2019","journal-title":"Proc 7th Int Conf Learn Represent (ICLR)"},{"key":"ref219","article-title":"Is local SGD better than minibatch SGD?","author":"woodworth","year":"2020"},{"key":"ref220","first-page":"2545","article-title":"Trading redundancy for communication: Speeding up distributed SGD for non-convex optimization","author":"haddadpour","year":"2019","journal-title":"Proc 36th Int Conf Mach Learn (ICML)"},{"key":"ref222","first-page":"13579","article-title":"Communication trade-offs for local-SGD with large step size","author":"dieuleveut","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref221","first-page":"2165","article-title":"Order optimal one-shot distributed learning","author":"sharif-nassab","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2021.3087272"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337828"},{"key":"ref227","first-page":"8056","article-title":"Pipe-SGD: A decentralized pipelined SGD framework for distributed deep net training","author":"li","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015289"},{"key":"ref223","first-page":"11080","article-title":"Local SGD with periodic averaging: Tighter analysis and adaptive synchronization","author":"haddadpour","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref127","author":"thapa","year":"2020","journal-title":"Splitfed When federated learning meets split learning"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1145\/3320269.3384740"},{"key":"ref125","article-title":"Split learning for health: Distributed deep learning without sharing raw patient data","author":"vepakomma","year":"2018"},{"key":"ref124","first-page":"1379","article-title":"Dynamic network surgery for efficient DNNs","author":"guo","year":"2016","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737614"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037698"},{"key":"ref130","article-title":"A survey of model compression and acceleration for deep neural networks","author":"cheng","year":"2017"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1137\/S0895479896305696"},{"key":"ref134","article-title":"Accumulation bit-width scaling for ultra-low precision training of deep networks","author":"sakr","year":"2019","journal-title":"Proc 7th Int Conf Learn Represent (ICLR)"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-020-09816-7"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.2976475"},{"key":"ref232","first-page":"4150","article-title":"PipeTransformer: Automated elastic pipelining for distributed training of large-scale models","volume":"139","author":"he","year":"2021","journal-title":"Proc 38th Int Conf Mach Learn"},{"key":"ref233","first-page":"307","article-title":"HetPipe: Enabling large DNN training on (Whimpy) heterogeneous GPU clusters through integration of pipelined model parallelism and data parallelism","author":"park","year":"2020","journal-title":"Proc USENIX Annu Techn Conf (USENIX ATC)"},{"key":"ref230","first-page":"103","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref231","article-title":"PipeDream: Fast and efficient pipeline parallel DNN training","author":"harlap","year":"2018"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3002925"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3025365"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3026589"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3009358"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/TCC.2018.2789446"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3017377"},{"key":"ref136","first-page":"1737","article-title":"Deep learning with limited numerical precision","volume":"37","author":"gupta","year":"2015","journal-title":"Proc 32nd Int Conf Mach Learn (ICML)"},{"key":"ref135","article-title":"Relaxed quantization for discretized neural networks","author":"louizos","year":"2019","journal-title":"Proc 7th Int Conf Learn Represent (ICLR)"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1626"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541967"},{"key":"ref139","year":"2021"},{"key":"ref140","first-page":"1967","article-title":"Pelee: A real-time object detection system on mobile devices","author":"wang","year":"2018","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref141","article-title":"Binarized neural networks on the ImageNet classification task","author":"wu","year":"2016"},{"key":"ref142","article-title":"Neural networks with few multiplications","author":"lin","year":"2016","journal-title":"Proc 4th Int Conf Learn Represent (ICLR)"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_32"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2019.2959035"},{"key":"ref144","first-page":"4596","article-title":"Adaptive quantization for deep neural network","author":"zhou","year":"2018","journal-title":"Proc 32nd AAAI Conf Artif Intell (AAAI)"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.2963996"},{"key":"ref145","first-page":"3123","article-title":"BinaryConnect: Training deep neural networks with binary weights during propagations","author":"courbariaux","year":"2015","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2021.3083154"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/TCOMM.2019.2944169"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/OJSP.2020.3036276"},{"key":"ref244","author":"zhu","year":"2018","journal-title":"Towards an Intelligent Edge Wireless Communication Meets Machine Learning"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3022590"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1109\/WCNC49053.2021.9417322"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2020.2985039"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2014.2334709"},{"key":"ref245","author":"yang","year":"2019","journal-title":"Energy efficient federated learning over wireless communication networks"},{"key":"ref249","doi-asserted-by":"crossref","first-page":"2184","DOI":"10.1109\/JSAC.2019.2933969","article-title":"Machine learning in the air","volume":"37","author":"g\u00fcnd\u00fcz","year":"2019","journal-title":"IEEE J Sel Areas Commun"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1007\/BF00994018"},{"key":"ref108","article-title":"Horovod: Fast and easy distributed deep learning in tensorflow","author":"sergeev","year":"2018"},{"key":"ref107","year":"2020"},{"key":"ref106","first-page":"7663","article-title":"Communication compression for decentralized training","author":"tang","year":"2018","journal-title":"Proc Annu Conf Neural Inf Process Syst"},{"key":"ref105","first-page":"3049","article-title":"Asynchronous decentralized parallel stochastic gradient descent","author":"lian","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn (ICML)"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3035933"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080244"},{"key":"ref102","first-page":"595","article-title":"Gandiva: Introspective cluster scheduling for deep learning","author":"xiao","year":"2018","journal-title":"Proc OSDI"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/TSIPN.2017.2695121"},{"key":"ref110","author":"neter","year":"1974","journal-title":"Applied Linear Statistical Models Regression Analysis of Variance and Experimental Designs"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2020.2981904"},{"key":"ref251","author":"zhu","year":"2020","journal-title":"One-bit over-the-air aggregation for communication-efficient federated edge learning Design and convergence analysis"},{"key":"ref254","year":"2021"},{"key":"ref255","year":"2021","journal-title":"Apple Core ML Framework"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155479"},{"key":"ref253","author":"anil","year":"2020","journal-title":"Second order optimization made practical"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3405878"},{"key":"ref10","author":"brown","year":"2020","journal-title":"Language models are few-shot learners"},{"key":"ref11","author":"amodei","year":"2018","journal-title":"AI and Compute"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"ref13","article-title":"Cisco global cloud index: Forecast and methodology, 2016&#x2013;2021","year":"0"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3020911"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3007662"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3015986"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1145\/1402958.1402967"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267817"},{"key":"ref117","year":"0","journal-title":"MPI Forum Message Passing Interface (MPI)Forum Home Page"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3363554"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3377454"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1145\/1592568.1592577"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2014.2304432"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1137\/130943170"},{"key":"ref116","first-page":"2165","article-title":"DSA: Decentralized double stochastic averaging gradient algorithm","volume":"17","author":"mokhtari","year":"2016","journal-title":"J Mach Learn Res"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1137\/14096668X"},{"key":"ref120","article-title":"Distributed machine learning via sufficient factor broadcasting","author":"xie","year":"2014"},{"key":"ref121","first-page":"795","article-title":"Lighter-communication distributed machine learning via sufficient factor broadcasting","author":"xie","year":"2016","journal-title":"Proc UAI"},{"key":"ref122","first-page":"345","article-title":"Towards accurate binary convolutional neural network","author":"lin","year":"2017","journal-title":"Proc Annu Conf Neural Inf Process Syst (NeurIPS)"},{"key":"ref123","first-page":"177","article-title":"Octo: Int8 training with loss-aware compensation and backward quantization for tiny on-device learning","author":"zhou","year":"2021","journal-title":"Proc USENIX Annu Techn Conf (USENIX ATC)"}],"container-title":["IEEE Internet of Things Journal"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6488907\/9672206\/09534784.pdf?arnumber=9534784","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,8]],"date-time":"2022-04-08T18:00:59Z","timestamp":1649440859000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9534784\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,15]]},"references-count":256,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/jiot.2021.3111624","relation":{},"ISSN":["2327-4662","2372-2541"],"issn-type":[{"value":"2327-4662","type":"electronic"},{"value":"2372-2541","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,15]]}}}