{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T21:05:10Z","timestamp":1776891910145,"version":"3.51.2"},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.neucom.2026.133413","type":"journal-article","created":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T08:37:53Z","timestamp":1773909473000},"page":"133413","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Convergence analysis of the last iterate in distributed stochastic gradient descent with momentum"],"prefix":"10.1016","volume":"682","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1734-5244","authenticated-orcid":false,"given":"Difei","family":"Cheng","sequence":"first","affiliation":[]},{"given":"Ruinan","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133413_bib0005","series-title":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"3675","article-title":"Decentralized deep learning using momentum-accelerated consensus","author":"Balu","year":"2021"},{"key":"10.1016\/j.neucom.2026.133413_bib0010","author":"Defazio"},{"key":"10.1016\/j.neucom.2026.133413_bib0015","author":"Gao"},{"issue":"9","key":"10.1016\/j.neucom.2026.133413_bib0020","doi-asserted-by":"crossref","first-page":"7270","DOI":"10.1109\/TIT.2025.3588401","article-title":"On the convergence of (stochastic) gradient descent for kolmogorov\u2013arnold networks","volume":"71","author":"Gao","year":"2025","journal-title":"IEEE Trans. Inf. Theory"},{"key":"10.1016\/j.neucom.2026.133413_bib0025","article-title":"Understanding the role of momentum in stochastic gradient methods","volume":"32","author":"Gitman","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133413_bib0030","series-title":"2013 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6645","article-title":"Speech recognition with deep recurrent neural networks","author":"Graves","year":"2013"},{"key":"10.1016\/j.neucom.2026.133413_bib0035","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neucom.2026.133413_bib0040","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1126\/science.1127647","article-title":"Reducing the dimensionality of data with neural networks","volume":"313","author":"Hinton","year":"2006","journal-title":"Science"},{"key":"10.1016\/j.neucom.2026.133413_bib0045","doi-asserted-by":"crossref","first-page":"1294","DOI":"10.1137\/24M1639464","article-title":"Non-convergence to global minimizers for Adam and stochastic gradient descent optimization and constructions of local minimizers in the training of artificial neural networks","volume":"13","author":"Jentzen","year":"2025","journal-title":"SIAM\/ASA J. Uncertain. Quantif."},{"key":"10.1016\/j.neucom.2026.133413_bib0050","article-title":"A linear speedup analysis of distributed deep learning with sparse and quantized communication","volume":"31","author":"Jiang","year":"2018","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133413_bib0055","article-title":"Collaborative deep learning in fixed topology networks","volume":"30","author":"Jiang","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133413_bib0060","doi-asserted-by":"crossref","DOI":"10.3389\/frai.2021.573731","article-title":"On consensus-optimality trade-offs in collaborative deep learning","volume":"4","author":"Jiang","year":"2021","journal-title":"Front. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133413_bib0065","author":"Jin"},{"key":"10.1016\/j.neucom.2026.133413_bib0070","series-title":"Advances in Neural Information Processing Systems","first-page":"36559","article-title":"Revisit last-iterate convergence of msgd under milder requirement on step size","author":"Jin","year":"2022"},{"key":"10.1016\/j.neucom.2026.133413_bib0075","author":"Jin"},{"key":"10.1016\/j.neucom.2026.133413_bib0080","first-page":"359","article-title":"Convergence of proximal-gradient stochastic variational inference under non-decreasing step-size sequence","volume":"319","author":"Khan","year":"2015","journal-title":"J. Comp. Neurol."},{"key":"10.1016\/j.neucom.2026.133413_bib0085","first-page":"167","article-title":"Combining ordered subsets and momentum for accelerated x-ray CT image reconstruction","volume":"34","author":"Kim","year":"2014","journal-title":"IEEE Trans. Med. Imaging"},{"key":"10.1016\/j.neucom.2026.133413_bib0090","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","volume":"25","author":"Krizhevsky","year":"2012","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133413_bib0095","article-title":"Can decentralized algorithms outperform centralized algorithms? A case study for decentralized parallel stochastic gradient descent","volume":"30","author":"Lian","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133413_bib0100","series-title":"ICLR","article-title":"Deep gradient compression: reducing the communication bandwidth for distributed training","author":"Lin","year":"2018"},{"key":"10.1016\/j.neucom.2026.133413_bib0105","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1016\/j.neucom.2023.01.032","article-title":"Last-iterate convergence analysis of stochastic momentum methods for neural networks","volume":"527","author":"Liu","year":"2023","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133413_bib0110","first-page":"1","article-title":"Almost sure convergence rates analysis and saddle avoidance of stochastic gradient methods","volume":"25","author":"Liu","year":"2024","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.neucom.2026.133413_bib0115","series-title":"Human Language Technologies: the 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics","first-page":"456","article-title":"Distributed training strategies for the structured perceptron","author":"McDonald","year":"2010"},{"key":"10.1016\/j.neucom.2026.133413_bib0120","doi-asserted-by":"crossref","first-page":"953","DOI":"10.1109\/JPROC.2018.2817461","article-title":"Network topology and communication-computation tradeoffs in decentralized optimization","volume":"106","author":"Nedi\u0107","year":"2018","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.neucom.2026.133413_bib0125","series-title":"Introductory Lectures on Convex Optimization: A Basic Course","volume":"vol. 87","author":"Nesterov","year":"2013"},{"key":"10.1016\/j.neucom.2026.133413_bib0130","series-title":"International Conference on Machine Learning","first-page":"3750","article-title":"SGD and hogwild! convergence without the bounded gradients assumption","author":"Nguyen","year":"2018"},{"key":"10.1016\/j.neucom.2026.133413_bib0135","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/0041-5553(64)90137-5","article-title":"Some methods of speeding up the convergence of iteration methods","volume":"4","author":"Polyak","year":"1964","journal-title":"USSR Comput. Math. Math. Phys."},{"key":"10.1016\/j.neucom.2026.133413_bib0140","doi-asserted-by":"crossref","first-page":"400","DOI":"10.1214\/aoms\/1177729586","article-title":"A stochastic approximation method","volume":"22","author":"Robbins","year":"1951","journal-title":"Ann. Math. Stat."},{"key":"10.1016\/j.neucom.2026.133413_bib0145","series-title":"Conference on Learning Theory","first-page":"3935","article-title":"Almost sure convergence rates for stochastic gradient descent and stochastic heavy ball","author":"Sebbouh","year":"2021"},{"key":"10.1016\/j.neucom.2026.133413_bib0150","doi-asserted-by":"crossref","first-page":"954","DOI":"10.1109\/JSAIT.2021.3103920","article-title":"2020squarm: communication-efficient momentum SGD for decentralized optimization","volume":"2","author":"Singh","year":"2021","journal-title":"IEEE J. Sel. Areas Inf. Theory"},{"key":"10.1016\/j.neucom.2026.133413_bib0155","author":"Smith"},{"key":"10.1016\/j.neucom.2026.133413_bib0160","author":"Stich"},{"key":"10.1016\/j.neucom.2026.133413_bib0165","series-title":"International Conference on Machine Learning","first-page":"1139","article-title":"On the importance of initialization and momentum in deep learning","author":"Sutskever","year":"2013"},{"key":"10.1016\/j.neucom.2026.133413_bib0170","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.neucom.2018.04.048","article-title":"Adaptive deep feature learning network with nesterov momentum and its application to rotating machinery fault diagnosis","volume":"305","author":"Tang","year":"2018","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133413_bib0175","author":"Tao"},{"key":"10.1016\/j.neucom.2026.133413_bib0180","article-title":"Cooperative SGD: a unified framework for the design and analysis of local-update SGD algorithms","volume":"22","author":"Wang","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.neucom.2026.133413_bib0185","doi-asserted-by":"crossref","first-page":"3609","DOI":"10.1080\/03610926.2018.1478102","article-title":"On almost sure convergence for sums of stochastic sequence","volume":"48","author":"Wang","year":"2019","journal-title":"Commun. Stat.-Theory Methods"},{"key":"10.1016\/j.neucom.2026.133413_bib0190","series-title":"Proceedings of the 28th International Conference on Machine Learning (ICML-11)","first-page":"681","article-title":"Bayesian learning via stochastic gradient Langevin dynamics","author":"Welling","year":"2011"},{"key":"10.1016\/j.neucom.2026.133413_bib0195","author":"Yan"},{"key":"10.1016\/j.neucom.2026.133413_bib0200","series-title":"International Conference on Machine Learning","first-page":"7184","article-title":"On the linear speedup analysis of communication efficient momentum SGD for distributed non-convex optimization","author":"Yu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133413_bib0205","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"5693","article-title":"Parallel restarted SGD with faster convergence and less communication: demystifying why model averaging works for deep learning","volume":"vol. 33","author":"Yu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133413_bib0210","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"3029","article-title":"Decentlam: decentralized momentum SGD for large-batch deep training","author":"Yuan","year":"2021"},{"key":"10.1016\/j.neucom.2026.133413_bib0215","doi-asserted-by":"crossref","first-page":"2834","DOI":"10.1109\/TSP.2018.2818081","article-title":"On nonconvex decentralized gradient descent","volume":"66","author":"Zeng","year":"2018","journal-title":"IEEE Trans. Signal Process."},{"key":"10.1016\/j.neucom.2026.133413_bib0220","article-title":"Deep learning with elastic averaging SGD","volume":"28","author":"Zhang","year":"2015","journal-title":"Adv. Neural Inf. Process. Syst."}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226008106?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226008106?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:32:13Z","timestamp":1776889933000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226008106"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":44,"alternative-id":["S0925231226008106"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133413","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Convergence analysis of the last iterate in distributed stochastic gradient descent with momentum","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133413","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"133413"}}