{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,8]],"date-time":"2025-07-08T09:05:45Z","timestamp":1751965545673,"version":"3.37.3"},"reference-count":81,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2020AAA0105601"],"award-info":[{"award-number":["2020AAA0105601"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12371512","62276208"],"award-info":[{"award-number":["12371512","62276208"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017596","name":"Natural Science Basic Research Program of Shaanxi Province","doi-asserted-by":"publisher","award":["2024JC-JCQN-02"],"award-info":[{"award-number":["2024JC-JCQN-02"]}],"id":[{"id":"10.13039\/501100017596","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/tpami.2024.3444002","type":"journal-article","created":{"date-parts":[[2024,8,15]],"date-time":"2024-08-15T17:35:49Z","timestamp":1723743349000},"page":"10375-10388","source":"Crossref","is-referenced-by-count":2,"title":["Sharpness-Aware Lookahead for Accelerating Convergence and Improving Generalization"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7091-898X","authenticated-orcid":false,"given":"Chengli","family":"Tan","sequence":"first","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8395-1180","authenticated-orcid":false,"given":"Jiangshe","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1462-7248","authenticated-orcid":false,"given":"Junmin","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1793-5836","authenticated-orcid":false,"given":"Yihong","family":"Gong","sequence":"additional","affiliation":[{"name":"College of Software Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"issue":"1","key":"ref2","first-page":"4839","article-title":"Beyond english-centric multilingual machine translation","volume":"22","author":"Fan","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref3","first-page":"3915","article-title":"Self-supervised learning with random-projection quantizer for speech recognition","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chiu"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00550-z"},{"article-title":"On empirical comparisons of optimizers for deep learning","year":"2019","author":"Choi","key":"ref5"},{"article-title":"Benchmarking neural network training algorithms","year":"2023","author":"Dahl","key":"ref6"},{"key":"ref7","first-page":"9593","article-title":"Lookahead optimizer: K steps forward, 1 step back","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"ref9","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma"},{"key":"ref10","first-page":"1","article-title":"Taming GANs with lookahead-minmax","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chavdarova"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01110"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00971"},{"key":"ref13","first-page":"9367","article-title":"Descending through a crowded valley-benchmarking deep learning optimizers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Schmidt"},{"key":"ref14","first-page":"4764","article-title":"Bidirectional looking with a novel double exponential moving average to adaptive and non-adaptive momentum optimizers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.1.1"},{"key":"ref16","first-page":"1","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Keskar"},{"key":"ref17","first-page":"16577","article-title":"When do flat minima optimizers work?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kaddour"},{"key":"ref18","first-page":"840","article-title":"A modern look at the relationship between sharpness and generalization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Andriushchenko"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ab39d9"},{"key":"ref20","first-page":"1","article-title":"Snapshot ensembles: Train 1, get m for free","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Huang"},{"key":"ref21","first-page":"1","article-title":"Sharpness-aware minimization for efficiently improving generalization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Foret"},{"key":"ref22","first-page":"8289","article-title":"How SGD selects the global minima in over-parameterized learning: A dynamical stability perspective","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wu"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3071289"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4471-2063-6_2"},{"key":"ref25","first-page":"1019","article-title":"Sharp minima can generalize for deep nets","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Dinh"},{"key":"ref26","first-page":"1024","article-title":"Sharpness minimization algorithms do not only minimize sharpness to achieve better generalization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wen"},{"article-title":"Gradient descent happens in a tiny subspace","year":"2018","author":"Gur-Ari","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3178101"},{"article-title":"The law of parsimony in gradient descent for learning deep linear networks","year":"2023","author":"Yaras","key":"ref29"},{"key":"ref30","first-page":"948","article-title":"Understanding gradient descent on the edge of stability in deep learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Arora"},{"key":"ref31","first-page":"28040","article-title":"The inductive bias of flatness regularization for deep matrix factorization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gatmiry"},{"key":"ref32","first-page":"1","article-title":"Fantastic generalization measures and where to find them","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jiang"},{"key":"ref33","first-page":"8299","article-title":"Low-pass filtering SGD for recovering flat optima in the deep learning optimization landscape","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Bisla"},{"article-title":"Averaging weights leads to wider optima and better generalization","year":"2018","author":"Izmailov","key":"ref34"},{"key":"ref35","first-page":"1","article-title":"Eliminating sharp minima from SGD with truncated heavy-tailed noise","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wang"},{"key":"ref36","first-page":"15042","article-title":"Stochastic optimization with heavy-tailed noise via accelerated gradient clipping","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gorbunov"},{"key":"ref37","first-page":"5827","article-title":"A tail-index analysis of stochastic gradient noise in deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Simsekli"},{"article-title":"SALR: Sharpness-aware learning rate scheduler for improved generalization","year":"2021","author":"Yue","key":"ref38"},{"key":"ref39","first-page":"5905","article-title":"ASAM: Adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kwon"},{"key":"ref40","first-page":"11148","article-title":"Fisher SAM: Information geometry and sharpness aware minimisation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref41","first-page":"1","article-title":"Surrogate gap minimization improves sharpness-aware training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhuang"},{"key":"ref42","first-page":"24543","article-title":"Random sharpness-aware minimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599501"},{"key":"ref44","first-page":"1","article-title":"Qualitatively characterizing neural network optimization problems","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Goodfellow"},{"key":"ref45","first-page":"6391","article-title":"Visualizing the loss landscape of neural nets","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref46","first-page":"2698","article-title":"An alternative view: When does SGD escape local minima?","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kleinberg"},{"key":"ref47","first-page":"1","article-title":"When vision transformers outperform resnets without pre-training or strong data augmentations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen"},{"key":"ref48","first-page":"26982","article-title":"Penalizing gradient norm for efficiently improving generalization in deep learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhao"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511804441"},{"key":"ref50","first-page":"639","article-title":"Towards understanding sharpness-aware minimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Andriushchenko"},{"key":"ref51","first-page":"1","article-title":"How sharpness-aware minimization minimizes sharpness?","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wen"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1561\/2200000050"},{"key":"ref53","first-page":"499","article-title":"Stability and generalization","volume":"2","author":"Bousquet","year":"2002","journal-title":"J. Mach. Learn. Res."},{"key":"ref54","first-page":"797","article-title":"Escaping from saddle points\u2013online stochastic gradient for tensor decomposition","volume-title":"Proc. Conf. Learn. Theory","author":"Ge"},{"key":"ref55","first-page":"1","article-title":"A simple convergence proof of adam and adagrad","volume":"2022","author":"D\u00e9fossez","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref56","first-page":"1225","article-title":"Train faster, generalize better: Stability of stochastic gradient descent","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Hardt"},{"key":"ref57","first-page":"27290","article-title":"Towards understanding why lookahead generalizes better than SGD and beyond","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhou"},{"key":"ref58","first-page":"2364","article-title":"Stability of SGD: Tightness analysis and improved bounds","volume-title":"Proc. Int. Conf. Uncertain. Artif. Intell.","author":"Zhang"},{"article-title":"Improving generalization performance by switching from adam to SGD","year":"2017","author":"Keskar","key":"ref59"},{"key":"ref60","first-page":"4151","article-title":"The marginal value of adaptive gradient methods in machine learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wilson"},{"key":"ref61","first-page":"1","article-title":"On the convergence of adam and beyond","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Reddi"},{"key":"ref62","first-page":"1","article-title":"Decoupled weight decay regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"},{"article-title":"Learning multiple layers of features from tiny images","year":"2009","author":"Krizhevsky","key":"ref63"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2990339"},{"key":"ref67","doi-asserted-by":"crossref","DOI":"10.5244\/C.30.87","article-title":"Wide residual networks","author":"Zagoruyko","year":"2016"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.668"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00020"},{"key":"ref71","first-page":"1","article-title":"SGDR: Stochastic gradient descent with warm restarts","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref74","first-page":"18795","article-title":"Adabelief optimizer: Adapting stepsizes by the belief in observed gradients","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhuang"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.21236\/ADA273556"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref77","first-page":"1","article-title":"Pointer sentinel mixture models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Merity"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/BigData50022.2020.9378171"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10746266\/10637761.pdf?arnumber=10637761","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:27:15Z","timestamp":1732667235000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10637761\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":81,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3444002","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}