{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T08:37:36Z","timestamp":1758098256798,"version":"3.44.0"},"reference-count":75,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tpami.2025.3581310","type":"journal-article","created":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T13:45:46Z","timestamp":1751031946000},"page":"8538-8549","source":"Crossref","is-referenced-by-count":0,"title":["Systematic Investigation of Sparse Perturbed Sharpness-Aware Minimization Optimizer"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2911-4965","authenticated-orcid":false,"given":"Peng","family":"Mi","sequence":"first","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5659-3464","authenticated-orcid":false,"given":"Li","family":"Shen","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianhe","family":"Ren","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5110-4526","authenticated-orcid":false,"given":"Yiyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2142-4664","authenticated-orcid":false,"given":"Tianshuo","family":"Xu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3912-9306","authenticated-orcid":false,"given":"Xiaoshuai","family":"Sun","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9640-6472","authenticated-orcid":false,"given":"Tongliang","family":"Liu","sequence":"additional","affiliation":[{"name":"The University of Sydney, Camperdown, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9163-2932","authenticated-orcid":false,"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-5449","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2020","author":"Dosovitskiy","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.5244\/C.30.87"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"article-title":"Large scale GAN training for high fidelity natural image synthesis","year":"2018","author":"Brock","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1997.595474"},{"key":"ref8","first-page":"1928","article-title":"Asynchronous methods for deep reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Mnih"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"article-title":"Improved regularization of convolutional neural networks with cutout","year":"2017","author":"DeVries","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4899-7502-7_79-1"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"article-title":"Improving neural networks by preventing co-adaptation of feature detectors","year":"2012","author":"Hinton","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045167"},{"article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","year":"2016","author":"Keskar","key":"ref15"},{"author":"Foret","key":"ref16","article-title":"Sharpness-aware minimization for efficiently improving generalization"},{"key":"ref17","first-page":"529","article-title":"Simplifying neural nets by discovering flat minima","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hochreiter"},{"key":"ref18","first-page":"5949","article-title":"Exploring generalization in deep learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Neyshabur"},{"key":"ref19","first-page":"2958","article-title":"Adversarial weight perturbation helps robust generalization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wu"},{"key":"ref20","first-page":"5905","article-title":"ASAM: Adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kwon"},{"article-title":"Efficient sharpness-aware minimization for improved training of neural networks","year":"2021","author":"Du","key":"ref21"},{"article-title":"When vision transformers outperform resnets without pre-training or strong data augmentations","year":"2021","author":"Chen","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1098\/rsta.1922.0009"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/307400.307435"},{"article-title":"Learning multiple layers of features from tiny images","year":"2009","author":"Krizhevsky","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"article-title":"Make sharpness-aware minimization stronger: A sparsified perturbation approach","year":"2022","author":"Mi","key":"ref28"},{"key":"ref29","first-page":"1019","article-title":"Sharp minima can generalize for Deep Nets","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Dinh"},{"article-title":"Understanding sharpness-aware minimization","year":"2021","author":"Andriushchenko","key":"ref30"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00538"},{"key":"ref32","first-page":"24543","article-title":"Random sharpness-aware minimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu"},{"article-title":"How does sharpness-aware minimization minimize sharpness?","year":"2022","author":"Wen","key":"ref33"},{"article-title":"Penalizing gradient norm for efficiently improving generalization in deep learning","year":"2022","author":"Zhao","key":"ref34"},{"article-title":"Surrogate gap minimization improves sharpness-aware training","year":"2022","author":"Zhuang","key":"ref35"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1611835114"},{"key":"ref37","first-page":"24193","article-title":"Training neural networks with fixed sparse masks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sung"},{"article-title":"Faster gaze prediction with dense networks and fisher pruning","year":"2018","author":"Theis","key":"ref38"},{"key":"ref39","first-page":"598","article-title":"Optimal brain damage","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"LeCun"},{"article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding","year":"2015","author":"Han","key":"ref40"},{"key":"ref41","first-page":"2943","article-title":"Rigging the lottery: Making all tickets winners","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Evci"},{"article-title":"Deep rewiring: Training very sparse deep networks","year":"2017","author":"Bellec","key":"ref42"},{"article-title":"Sparse networks from scratch: Faster training without losing performance","year":"2019","author":"Dettmers","key":"ref43"},{"key":"ref44","first-page":"20744","article-title":"Top-kast: Top-k always sparse training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jayakumar"},{"key":"ref45","first-page":"4646","article-title":"Parameter efficient training of deep convolutional neural networks by dynamic sparse reparameterization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Mostafa"},{"key":"ref46","first-page":"9908","article-title":"Sparse training via boosting pruning plasticity with neuroregeneration","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu"},{"author":"Liu","key":"ref47","article-title":"The unreasonable effectiveness of random pruning: Return of the most naive baseline for sparse training"},{"article-title":"The lottery ticket hypothesis: Finding sparse, trainable neural networks","year":"2018","author":"Frankle","key":"ref48"},{"key":"ref49","first-page":"18098","article-title":"Woodfisher: Efficient second-order approximation for neural network compression","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Singh"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01152"},{"article-title":"Pruning filters for efficient convnets","year":"2016","author":"Li","key":"ref51"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00083"},{"article-title":"Bi-directional masks for efficient N: M sparse training","year":"2023","author":"Zhang","key":"ref53"},{"article-title":"Learning N: M fine-grained structured sparse neural networks from scratch","year":"2021","author":"Zhou","key":"ref55"},{"key":"ref57","first-page":"21099","article-title":"Accelerated sparse neural training: A provable and efficient method to find n: M transposable masks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hubara"},{"key":"ref58","first-page":"13316","article-title":"Channel permutations for N: M sparsity","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Pool"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1162\/08997660260028683"},{"issue":"146","key":"ref60","first-page":"1","article-title":"New insights and perspectives on the natural gradient method","volume":"21","author":"Martens","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref61","first-page":"9573","article-title":"Fast approximate natural gradient descent in a Kronecker factored eigenbasis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"George"},{"key":"ref62","first-page":"2408","article-title":"Optimizing neural networks with Kronecker-factored approximate curvature","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Martens"},{"key":"ref63","first-page":"573","article-title":"A Kronecker-factored approximate fisher matrix for convolution layers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Grosse"},{"article-title":"Concurrent adversarial learning for large-batch training","year":"2021","author":"Liu","key":"ref64"},{"key":"ref65","first-page":"1","article-title":"Towards practical adam: Non-convexity, convergence theory, and mini-batch acceleration","volume":"23","author":"Chen","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1137\/120880811"},{"key":"ref68","first-page":"499","article-title":"Stability and generalization","volume":"2","author":"Bousquet","year":"2002","journal-title":"J. Mach. Learn. Res."},{"article-title":"Stabilizing sharpness-aware minimization through a simple renormalization strategy","year":"2024","author":"Tan","key":"ref69"},{"article-title":"Very deep convolutional networks for large-scale image recognition","year":"2014","author":"Simonyan","key":"ref70"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref72","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron"},{"article-title":"High performance convolutional neural networks for document processing","volume-title":"Proc. 10th Int. Workshop Front. Handwriting Recognit.","author":"Chellapilla","key":"ref73"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.508"},{"key":"ref76","first-page":"6391","article-title":"Visualizing the loss landscape of neural nets","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"article-title":"The break-even point on optimization trajectories of deep neural networks","year":"2020","author":"Jastrzebski","key":"ref77"},{"key":"ref78","first-page":"2232","article-title":"An investigation into neural net optimization via hessian eigenvalue density","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ghorbani"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11163533\/11054316.pdf?arnumber=11054316","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T13:11:23Z","timestamp":1758028283000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11054316\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":75,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3581310","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}