{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T22:02:20Z","timestamp":1770847340011,"version":"3.50.1"},"reference-count":73,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004359","name":"Vetenskapsr\u00e5det","doi-asserted-by":"publisher","award":["2020-03607"],"award-info":[{"award-number":["2020-03607"]}],"id":[{"id":"10.13039\/501100004359","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100030798","name":"Digital Futures","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100030798","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sweden&#x0027;s Innovation Agency"},{"name":"National Academic Infrastructure for Supercomputing in Sweden"},{"name":"Chalmers Centre for Computational Science and Engineering"},{"DOI":"10.13039\/501100004359","name":"Vetenskapsr\u00e5det","doi-asserted-by":"publisher","award":["2022-06725"],"award-info":[{"award-number":["2022-06725"]}],"id":[{"id":"10.13039\/501100004359","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Control Netw. Syst."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1109\/tcns.2025.3527255","type":"journal-article","created":{"date-parts":[[2025,1,8]],"date-time":"2025-01-08T15:30:23Z","timestamp":1736350223000},"page":"1721-1732","source":"Crossref","is-referenced-by-count":1,"title":["Parallel Momentum Methods Under Biased Gradient Estimations"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4884-4600","authenticated-orcid":false,"given":"Ali","family":"Beikmohammadi","sequence":"first","affiliation":[{"name":"Department of Computer and System Science, Stockholm University, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4473-2011","authenticated-orcid":false,"given":"Sarit","family":"Khirirat","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6617-8683","authenticated-orcid":false,"given":"Sindri","family":"Magn\u00fasson","sequence":"additional","affiliation":[{"name":"Department of Computer and System Science, Stockholm University, Stockholm, Sweden"}]}],"member":"263","reference":[{"key":"ref1","first-page":"265","article-title":"On optimization methods for deep learning","volume-title":"Proc. 28th Int. Conf. Int. Conf. Mach. Learn.","author":"Le","year":"2011"},{"key":"ref2","first-page":"215","article-title":"An analysis of single-layer networks in unsupervised feature learning","volume-title":"Proc. 14th Int. Conf. Artif. Intell. Statist-JMLR Workshop","author":"Coates","year":"2011"},{"key":"ref3","first-page":"1","article-title":"Parameter server for distributed machine learning","volume-title":"Proc. Big Learn. NIPS Workshop","author":"Li","year":"2013"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref5","first-page":"1139","article-title":"On the importance of initialization and momentum in deep learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sutskever","year":"2013"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-35289-8_25"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"ref8","article-title":"Parallelized stochastic gradient descent","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zinkevich","year":"2010"},{"key":"ref9","first-page":"1","article-title":"SGDR: Stochastic gradient descent with warm restarts","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov","year":"2022"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2016.2525015"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2017.8264077"},{"key":"ref12","first-page":"14977","article-title":"The step decay schedule: A near optimal, geometrically decaying learning rate procedure for least squares","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ge","year":"2019"},{"key":"ref13","first-page":"14226","article-title":"On the convergence of step decay step-size for stochastic optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2021"},{"key":"ref14","first-page":"3935","article-title":"Almost sure convergence rates for stochastic gradient descent and stochastic heavy ball","volume-title":"Proc. Conf. Learn. Theory","author":"Sebbouh","year":"2021"},{"key":"ref15","first-page":"6281","article-title":"Minibatch vs local SGD for heterogeneous distributed learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Woodworth","year":"2020"},{"issue":"1","key":"ref16","first-page":"962","article-title":"Importance sampling for minibatches","volume":"19","author":"Csiba","year":"2018","journal-title":"J. Mach. Learn. Res."},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1137\/140961791"},{"key":"ref18","first-page":"315","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Johnson","year":"2013"},{"key":"ref19","first-page":"17309","article-title":"Random reshuffling: Simple analysis with vast improvements","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Mishchenko","year":"2020"},{"key":"ref20","first-page":"2624","article-title":"Random shuffling beats SGD after finite epochs","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haochen","year":"2019"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-019-01440-w"},{"key":"ref22","first-page":"1707","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Alistarh","year":"2017"},{"issue":"276","key":"ref23","first-page":"1","article-title":"On biased compression for distributed learning","volume":"24","author":"Beznosikov","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref24","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Pascanu","year":"2013"},{"key":"ref25","first-page":"4618","article-title":"Byzantine stochastic gradient descent","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Alistarh","year":"2018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2022.3153135"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33011544"},{"key":"ref28","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Finn","year":"2017"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1983.1103184"},{"key":"ref30","first-page":"1","article-title":"Analysis of SGD with biased gradient estimators","volume-title":"Proc. 2020 Workshop Beyond First Order Methods ML Syst.","author":"Ajalloeian","year":"2020"},{"key":"ref31","first-page":"18261","article-title":"An improved analysis of stochastic gradient descent with momentum","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu","year":"2020"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2952219"},{"key":"ref33","article-title":"Better theory for SGD in the nonconvex world","author":"Khaled","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref34","first-page":"1017","article-title":"Stochastic gradient descent, weighted sampling, and the randomized Kaczmarz algorithm","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Needell","year":"2014"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/8996.003.0006"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/0041-5553(64)90137-5"},{"key":"ref37","first-page":"543","article-title":"A method of solving a convex programming problem with convergence rate $\\mathcal {O}(\\frac{1}{k^{2}})$","volume-title":"Doklady Akademii Nauk","volume":"269","author":"Nesterov","year":"1983"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/BF01069146"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/410"},{"key":"ref40","first-page":"7184","article-title":"On the linear speedup analysis of communication efficient momentum SGD for distributed non-convex optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Yu","year":"2019"},{"key":"ref41","first-page":"1","article-title":"Quasi-hyperbolic momentum and Adam for deep learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ma","year":"2018"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2963066"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ECC.2015.7330562"},{"key":"ref44","first-page":"1","article-title":"Linearly convergent stochastic heavy ball method for minimizing generalization error","volume-title":"Proc. 10th NIPS Workshop Optim. Mach. Learn.","author":"Loizou","year":"2017"},{"key":"ref45","first-page":"410","article-title":"On the convergence of Nesterov\u2019s accelerated gradient method in stochastic settings","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Assran","year":"2020"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2019.2942513"},{"key":"ref47","article-title":"Handbook of convergence theorems for (stochastic) gradient methods","author":"Garrigos","year":"2023"},{"key":"ref48","first-page":"10379","article-title":"SMG: A shuffling gradient-based method with momentum","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tran","year":"2021"},{"key":"ref49","first-page":"3252","article-title":"Error feedback fixes signSGD and other gradient compression schemes","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Karimireddy","year":"2019"},{"key":"ref50","first-page":"1","article-title":"Why gradient clipping accelerates training: A theoretical justification for adaptivity","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang","year":"2019"},{"key":"ref51","first-page":"7325","article-title":"Stability and convergence of stochastic gradient clipping: Beyond Lipschitz continuity and smoothness","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Mai","year":"2021"},{"key":"ref52","first-page":"15511","article-title":"Improved analysis of clipping algorithms for non-convex optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhang","year":"2020"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1137\/S1052623497331063"},{"issue":"24","key":"ref54","first-page":"1","article-title":"On biased stochastic gradient estimation","volume":"23","author":"Driggs","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref55","first-page":"2759","article-title":"Biased stochastic first-order methods for conditional stochastic optimization and applications in meta learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hu","year":"2020"},{"key":"ref56","first-page":"23158","article-title":"A guide through the zoo of biased SGD","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Demidovich","year":"2024"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46128-1_50"},{"key":"ref58","first-page":"8759","article-title":"Uniform convergence of gradients for non-convex learning and optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Foster","year":"2018"},{"key":"ref59","article-title":"Estimating full Lipschitz constants of deep neural networks","author":"Herrera","year":"2020"},{"key":"ref60","first-page":"313","article-title":"The power of adaptivity in SGD: Self-tuning step sizes with unbounded gradients and affine variance","volume-title":"Proc. Conf. Learn. Theory","author":"Faw","year":"2022"},{"key":"ref61","first-page":"5209","article-title":"Feature noise induces loss discrepancy across groups","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Khani","year":"2020"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/tit.2010.2048503"},{"key":"ref63","first-page":"4384","article-title":"EF21: A new, simpler, theoretically better, and practically faster error feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Richtrik","year":"2021"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.16987"},{"key":"ref65","first-page":"15383","article-title":"Why are adaptive methods good for attention models?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhang","year":"2020"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11795"},{"key":"ref67","doi-asserted-by":"crossref","DOI":"10.1137\/1.9781611976595","volume-title":"Lectures on Stochastic Programming: Modeling and Theory","author":"Shapiro","year":"2021"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-9868.2009.00718.x"},{"key":"ref69","first-page":"3557","article-title":"Personalized federated learning with theoretical guarantees: A model-agnostic meta-learning approach","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Fallah","year":"2020"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-016-1017-3"},{"key":"ref71","article-title":"The MNIST database of handwritten digits","author":"LeCun","year":"1998"},{"key":"ref72","article-title":"Fashion-MNIST: A novel image dataset for benchmarking machine learning algorithms","author":"Xiao","year":"2017"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"}],"container-title":["IEEE Transactions on Control of Network Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6509490\/11045659\/10833794.pdf?arnumber=10833794","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T20:56:42Z","timestamp":1770843402000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10833794\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":73,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tcns.2025.3527255","relation":{},"ISSN":["2325-5870","2372-2533"],"issn-type":[{"value":"2325-5870","type":"electronic"},{"value":"2372-2533","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6]]}}}