{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T01:46:45Z","timestamp":1777340805420,"version":"3.51.4"},"reference-count":67,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Artificial Intelligence"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.artint.2026.104541","type":"journal-article","created":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T06:48:43Z","timestamp":1776840523000},"page":"104541","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["On the convergence of SignSGD under weak first- and second-order gradient Lipschitz"],"prefix":"10.1016","volume":"356","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5024-1900","authenticated-orcid":false,"given":"Tao","family":"Sun","sequence":"first","affiliation":[]},{"given":"Xinwang","family":"Liu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.artint.2026.104541_bib0001","doi-asserted-by":"crossref","first-page":"400","DOI":"10.1214\/aoms\/1177729586","article-title":"A stochastic approximation method","author":"Robbins","year":"1951","journal-title":"Annals Math. Stat."},{"key":"10.1016\/j.artint.2026.104541_bib0002","series-title":"Proceedings of the 35th International Conference on Machine Learning","first-page":"560","article-title":"signSGD: compressed optimisation for non-convex problems","volume":"80","author":"Bernstein","year":"2018"},{"key":"10.1016\/j.artint.2026.104541_bib0003","series-title":"International Conference on Learning Representations","article-title":"SignSGD with majority vote is communication efficient and fault tolerant","author":"Bernstein","year":"2019"},{"key":"10.1016\/j.artint.2026.104541_bib0004","series-title":"Proceedings of the 36th International Conference on Machine Learning","first-page":"3252","article-title":"Error feedback fixes SignSGD and other gradient compression schemes","volume":"97","author":"Karimireddy","year":"2019"},{"key":"10.1016\/j.artint.2026.104541_bib0005","series-title":"Proceedings of the 40th International Conference on Machine Learning","article-title":"Momentum ensures convergence of SIGNSGD under weaker assumptions","author":"Sun","year":"2023"},{"key":"10.1016\/j.artint.2026.104541_bib0006","doi-asserted-by":"crossref","unstructured":"X. Chen, C. Liang, D. Huang, E. Real, K. Wang, Y. Liu, H. Pham, X. Dong, T. Luong, C.-J. Hsieh, et al., Symbolic discovery of optimization algorithms,(2023) arXiv: 2302.06675.","DOI":"10.52202\/075280-2140"},{"issue":"2","key":"10.1016\/j.artint.2026.104541_bib0007","doi-asserted-by":"crossref","first-page":"223","DOI":"10.1137\/16M1080173","article-title":"Optimization methods for large-scale machine learning","volume":"60","author":"Bottou","year":"2018","journal-title":"Siam Rev."},{"key":"10.1016\/j.artint.2026.104541_bib0008","series-title":"International Conference on Learning Representations","article-title":"Why gradient clipping accelerates training: a theoretical justification for adaptivity","author":"Zhang","year":"2020"},{"key":"10.1016\/j.artint.2026.104541_bib0009","series-title":"Low Rank Approximation: Algorithms, Implementation, Applications","volume":"906","author":"Markovsky","year":"2012"},{"key":"10.1016\/j.artint.2026.104541_bib0010","first-page":"15511","article-title":"Improved analysis of clipping algorithms for non-convex optimization","volume":"33","author":"Zhang","year":"2020","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.artint.2026.104541_bib0011","series-title":"International Conference on Machine Learning","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","author":"Pascanu","year":"2013"},{"key":"10.1016\/j.artint.2026.104541_bib0012","series-title":"Deep Learning","author":"Goodfellow","year":"2016"},{"key":"10.1016\/j.artint.2026.104541_bib0013","series-title":"International Conference on Learning Representations","article-title":"Regularizing and optimizing LSTM language models","author":"Merity","year":"2018"},{"key":"10.1016\/j.artint.2026.104541_bib0014","series-title":"International Conference on Machine Learning","first-page":"1243","article-title":"Convolutional sequence to sequence learning","author":"Gehring","year":"2017"},{"key":"10.1016\/j.artint.2026.104541_bib0015","series-title":"International Conference on Learning Representations","article-title":"Can gradient clipping mitigate label noise?","author":"Menon","year":"2019"},{"key":"10.1016\/j.artint.2026.104541_bib0016","series-title":"International Conference on Machine Learning","first-page":"654","article-title":"\u201cConvex Until Proven Guilty\u201d: dimension-free acceleration of gradient descent on non-convex functions","author":"Carmon","year":"2017"},{"issue":"2","key":"10.1016\/j.artint.2026.104541_bib0017","doi-asserted-by":"crossref","first-page":"1751","DOI":"10.1137\/17M1114296","article-title":"Accelerated methods for nonconvex optimization","volume":"28","author":"Carmon","year":"2018","journal-title":"SIAM J. Optim."},{"issue":"2","key":"10.1016\/j.artint.2026.104541_bib0018","doi-asserted-by":"crossref","first-page":"395","DOI":"10.1137\/20M1321759","article-title":"First-order methods for nonconvex quadratic minimization","volume":"62","author":"Carmon","year":"2020","journal-title":"SIAM Rev."},{"key":"10.1016\/j.artint.2026.104541_bib0019","first-page":"1195","article-title":"Finding approximate local minima for nonconvex optimization in linear time","author":"Agarwal","year":"2017","journal-title":"ACM Symp. Theory Comput. (STOC)"},{"key":"10.1016\/j.artint.2026.104541_bib0020","series-title":"Conference On Learning Theory","first-page":"1042","article-title":"Accelerated gradient descent escapes saddle points faster than gradient descent","author":"Jin","year":"2018"},{"key":"10.1016\/j.artint.2026.104541_bib0021","series-title":"International Conference on Machine Learning","first-page":"12901","article-title":"Restarted nonconvex accelerated gradient descent: no more polylogarithmic factor in the O(\u03f5\u22127\/4) complexity","author":"Li","year":"2022"},{"key":"10.1016\/j.artint.2026.104541_bib0022","series-title":"International Conference on Learning Representations","article-title":"Why gradient clipping accelerates training: a theoretical justification for adaptivity","author":"Zhang","year":"2019"},{"key":"10.1016\/j.artint.2026.104541_bib0023","first-page":"2771","article-title":"Non-convex distributionally robust optimization: non-asymptotic analysis","volume":"34","author":"Jin","year":"2021","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.artint.2026.104541_bib0024","series-title":"International Conference on Machine Learning","first-page":"7325","article-title":"Stability and convergence of stochastic gradient clipping: beyond Lipschitz continuity and smoothness","author":"Mai","year":"2021"},{"key":"10.1016\/j.artint.2026.104541_bib0025","unstructured":"L. Sun, P. Richt\u00e1rik, A note on the convergence of mirrored stein variational gradient descent under (L0,L1)\u2212 smoothness condition,(2022) arXiv: 2206.09709."},{"key":"10.1016\/j.artint.2026.104541_bib0026","series-title":"Doklady Akademii Nauk","first-page":"543","article-title":"A method of solving a convex programming problem with convergence rate O\\bigl(k\u23032\\bigr)","volume":"269","author":"Nesterov","year":"1983"},{"key":"10.1016\/j.artint.2026.104541_bib0027","series-title":"Proceedings of the 37th International Conference on Machine Learning","first-page":"410","article-title":"On the convergence of Nesterov\u2019s accelerated gradient method in stochastic settings","author":"Assran","year":"2020"},{"key":"10.1016\/j.artint.2026.104541_bib0028","series-title":"International Conference on Machine Learning","first-page":"2260","article-title":"Momentum improves normalized SGD","author":"Cutkosky","year":"2020"},{"key":"10.1016\/j.artint.2026.104541_bib0029","article-title":"Stochastic cubic regularization for fast nonconvex optimization","volume":"31","author":"Tripuraneni","year":"2018","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.artint.2026.104541_bib0030","article-title":"Natasha 2: faster non-convex optimization than SGD","volume":"31","author":"Allen-Zhu","year":"2018","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.artint.2026.104541_bib0031","series-title":"Conference on Learning Theory","first-page":"1192","article-title":"Sharp analysis for nonconvex sgd escaping from saddle points","author":"Fang","year":"2019"},{"key":"10.1016\/j.artint.2026.104541_bib0032","unstructured":"D. Zhou, P. Xu, Q. Gu, Finding local minima via stochastic nested variance reduction,(2018) arXiv: 1806.08782."},{"key":"10.1016\/j.artint.2026.104541_bib0033","series-title":"Conference on Learning Theory","first-page":"797","article-title":"Escaping from saddle points\u2014online stochastic gradient for tensor decomposition","author":"Ge","year":"2015"},{"key":"10.1016\/j.artint.2026.104541_bib0034","series-title":"International Conference on Machine Learning","first-page":"1724","article-title":"How to escape saddle points efficiently","author":"Jin","year":"2017"},{"key":"10.1016\/j.artint.2026.104541_bib0035","series-title":"International Conference on Algorithmic Learning Theory","first-page":"176","article-title":"Faster perturbed stochastic gradient methods for finding local minima","author":"Chen","year":"2022"},{"key":"10.1016\/j.artint.2026.104541_bib0036","series-title":"International Conference on Learning Representations","article-title":"signsignSGD via Zeroth-Order Oracle","author":"Liu","year":"2019"},{"key":"10.1016\/j.artint.2026.104541_bib0037","series-title":"International Conference on Learning Representations","article-title":"Sign bits are all you need for black-box attacks","author":"Al-Dujaili","year":"2020"},{"key":"10.1016\/j.artint.2026.104541_bib0038","series-title":"Advances in Neural Information Processing Systems","first-page":"14615","article-title":"Election coding for distributed learning: protecting SignSGD against byzantine attacks","volume":"33","author":"Sohn","year":"2020"},{"key":"10.1016\/j.artint.2026.104541_bib0039","unstructured":"R. Jin, Y. Huang, X. He, T. Wu, H. Dai, Stochastic-sign SGD for federated learning with theoretical guarantees,(2020) arXiv: 2002.10940."},{"key":"10.1016\/j.artint.2026.104541_bib0040","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"9224","article-title":"Stochastic sign descent methods: new algorithms and better theory","volume":"139","author":"Safaryan","year":"2021"},{"key":"10.1016\/j.artint.2026.104541_bib0041","doi-asserted-by":"crossref","unstructured":"M. Crawshaw, M. Liu, F. Orabona, W. Zhang, Z. Zhuang, Robustness to unbounded smoothness of generalized SignSGD,(2022) arXiv: 2208.11195.","DOI":"10.52202\/068431-0723"},{"issue":"8","key":"10.1016\/j.artint.2026.104541_bib0042","doi-asserted-by":"crossref","first-page":"7053","DOI":"10.1109\/TPAMI.2025.3566510","article-title":"On Nonconvex SGD under unbounded noise with weak gradient Lipschitz and delayed stochastic gradient","volume":"47","author":"Sun","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.artint.2026.104541_bib0043","unstructured":"T. Sun, X. Liu, K. Yuan, Gradient normalization provably benefits nonconvex sgd under heavy-tailed noise,(2024) arXiv: 2410.16561."},{"key":"10.1016\/j.artint.2026.104541_bib0044","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"15287","article-title":"Investigating the role of weight decay in enhancing nonconvex sgd","author":"Sun","year":"2025"},{"key":"10.1016\/j.artint.2026.104541_bib0045","series-title":"ICLR (Poster)","article-title":"Adam: a method for stochastic optimization","author":"Kingma","year":"2015"},{"key":"10.1016\/j.artint.2026.104541_bib0046","unstructured":"Y. Dong, H. Li, Z. Lin, Convergence rate analysis of LION,(2024) arXiv: 2411.07724."},{"key":"10.1016\/j.artint.2026.104541_bib0047","series-title":"International Conference on Machine Learning","first-page":"47475","article-title":"Mean-field analysis on two-layer neural networks from a kernel perspective","author":"Takakura","year":"2024"},{"key":"10.1016\/j.artint.2026.104541_bib0048","doi-asserted-by":"crossref","first-page":"24610","DOI":"10.52202\/075280-1070","article-title":"Global convergence analysis of local SGD for two-layer neural network without overparameterization","volume":"36","author":"Bao","year":"2023","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.artint.2026.104541_bib0049","first-page":"30167","article-title":"Implicit bias of gradient descent for two-layer relu and leaky relu networks on nearly-orthogonal data","volume":"36","author":"Kou","year":"2023","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.artint.2026.104541_bib0050","series-title":"Neural Networks: tricks of the Trade","first-page":"9","article-title":"Efficient backprop","author":"LeCun","year":"2002"},{"key":"10.1016\/j.artint.2026.104541_bib0051","unstructured":"I. Loshchilov, F. Hutter, Decoupled weight decay regularization,(2017) arXiv: 1711.05101."},{"issue":"5","key":"10.1016\/j.artint.2026.104541_bib0052","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/0041-5553(64)90137-5","article-title":"Some methods of speeding up the convergence of iteration methods","volume":"4","author":"Polyak","year":"1964","journal-title":"Ussr Comput. Math. Math. Phys."},{"key":"10.1016\/j.artint.2026.104541_bib0053","series-title":"Introduction to Optimization","first-page":"32","volume":"Vol. 1","author":"Polyak","year":"1987"},{"issue":"1\u20132","key":"10.1016\/j.artint.2026.104541_bib0054","doi-asserted-by":"crossref","first-page":"59","DOI":"10.1007\/s10107-015-0871-8","article-title":"Accelerated gradient methods for nonconvex nonlinear and stochastic programming","volume":"156","author":"Ghadimi","year":"2016","journal-title":"Math. Program."},{"key":"10.1016\/j.artint.2026.104541_bib0055","series-title":"International Conference on Machine Learning","first-page":"1139","article-title":"On the importance of initialization and momentum in deep learning","author":"Sutskever","year":"2013"},{"key":"10.1016\/j.artint.2026.104541_bib0056","series-title":"3rd International Conference on Learning Representations (ICLR 2015)","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2015"},{"key":"10.1016\/j.artint.2026.104541_bib0057","series-title":"Technical Report","article-title":"Learning Multiple Layers of Features from Tiny Images","author":"Krizhevsky","year":"2009"},{"key":"10.1016\/j.artint.2026.104541_bib0058","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.artint.2026.104541_bib0059","series-title":"International Conference on Learning Representations","article-title":"Pointer sentinel mixture models","author":"Merity","year":"2017"},{"key":"10.1016\/j.artint.2026.104541_bib0060","series-title":"Interspeech","first-page":"1045","article-title":"Recurrent neural network based language model","volume":"vol. 2","author":"Mikolov","year":"2010"},{"key":"10.1016\/j.artint.2026.104541_bib0061","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Adv. Neural Inform. Process. Syst."},{"issue":"140","key":"10.1016\/j.artint.2026.104541_bib0062","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.artint.2026.104541_bib0063","unstructured":"H. Touvron, T. Lavril, G. Izacard, X. Martinet, M.-A. Lachaux, T. Lacroix, B. Rozi\u00e8re, N. Goyal, E. Hambro, F. Azhar, et al., LLaMA: open and efficient foundation language models,(2023) arXiv: 2302.13971."},{"issue":"4","key":"10.1016\/j.artint.2026.104541_bib0064","doi-asserted-by":"crossref","first-page":"292","DOI":"10.2307\/1967124","article-title":"Note on the derivatives with respect to a parameter of the solutions of a system of differential equations","volume":"20","author":"Gronwall","year":"1919","journal-title":"Ann. Math."},{"key":"10.1016\/j.artint.2026.104541_bib0065","unstructured":"H.Z. Munthe-Kaas, O. Verdier, G. Vilmart, A short proof of Isserlis\u2019 theorem,(2025) arXiv: 2503.01588."},{"issue":"1\/2","key":"10.1016\/j.artint.2026.104541_bib0066","doi-asserted-by":"crossref","first-page":"134","DOI":"10.2307\/2331932","article-title":"On a formula for the product-moment coefficient of any order of a normal frequency distribution in any number of variables","volume":"12","author":"Isserlis","year":"1918","journal-title":"Biometrika"},{"key":"10.1016\/j.artint.2026.104541_bib0067","series-title":"Aspects of Multivariate Statistical Theory","author":"Muirhead","year":"2009"}],"container-title":["Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0004370226000676?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0004370226000676?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T01:09:55Z","timestamp":1777338595000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0004370226000676"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":67,"alternative-id":["S0004370226000676"],"URL":"https:\/\/doi.org\/10.1016\/j.artint.2026.104541","relation":{},"ISSN":["0004-3702"],"issn-type":[{"value":"0004-3702","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"On the convergence of SignSGD under weak first- and second-order gradient Lipschitz","name":"articletitle","label":"Article Title"},{"value":"Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.artint.2026.104541","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104541"}}