{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T05:06:58Z","timestamp":1764997618321,"version":"3.40.3"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031703430"},{"type":"electronic","value":"9783031703447"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70344-7_22","type":"book-chapter","created":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T08:02:43Z","timestamp":1724918563000},"page":"375-391","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Enhancing Sharpness-Aware Minimization by\u00a0Learning Perturbation Radius"],"prefix":"10.1007","author":[{"given":"Xuehao","family":"Wang","sequence":"first","affiliation":[]},{"given":"Weisen","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Shuai","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,22]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Allende, G.B., Still, G.: Solving bilevel programs with the KKT-approach. Mathematical Programming (2013)","DOI":"10.1007\/s10107-012-0535-x"},{"key":"22_CR2","unstructured":"Andriushchenko, M., Flammarion, N.: Towards understanding sharpness-aware minimization. In: ICML (2022)"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Bahri, D., Mobahi, H., Tay, Y.: Sharpness-aware minimization improves language model generalization. In: ACL (2022)","DOI":"10.18653\/v1\/2022.acl-long.508"},{"key":"22_CR4","unstructured":"Bisla, D., Wang, J., Choromanska, A.: Low-pass filtering SGD for recovering flat optima in the deep learning optimization landscape. In: AISTATS (2022)"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Bottou, L., Curtis, F.E., Nocedal, J.: Optimization methods for large-scale machine learning. In: SIAM Review (2018)","DOI":"10.1137\/16M1080173"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Bracken, J., McGill, J.T.: Mathematical programs with optimization problems in the constraints. Operations Research (1973)","DOI":"10.21236\/AD0765295"},{"key":"22_CR7","unstructured":"Cha, J., et al.: SWAD: Domain generalization by seeking flat minima. In: NeurIPS (2021)"},{"key":"22_CR8","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"22_CR9","unstructured":"Du, J., et al.: Efficient sharpness-aware minimization for improved training of neural networks. In: ICLR (2022)"},{"key":"22_CR10","unstructured":"Dziugaite, G.K., Roy, D.M.: Computing nonvacuous generalization bounds for deep (stochastic) neural networks with many more parameters than training data. In: UAI (2017)"},{"key":"22_CR11","doi-asserted-by":"crossref","unstructured":"Feurer, M., Hutter, F.: Hyperparameter optimization. In: AutoML (2019)","DOI":"10.1007\/978-3-030-05318-5_1"},{"key":"22_CR12","unstructured":"Foret, P., Kleiner, A., Mobahi, H., Neyshabur, B.: Sharpness-aware minimization for efficiently improving generalization. In: ICLR (2021)"},{"key":"22_CR13","unstructured":"Franceschi, L., Frasconi, P., Salzo, S., Grazzi, R., Pontil, M.: Bilevel programming for hyperparameter optimization and meta-learning. In: ICML (2018)"},{"key":"22_CR14","unstructured":"Ghadimi, S., Wang, M.: Approximation methods for bilevel programming. Preprint arXiv:1802.02246 (2018)"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Han, D., Kim, J., Kim, J.: Deep pyramidal residual networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.668"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Delving deep into rectifiers: surpassing human-level performance on ImageNet classification. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.123"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"22_CR18","unstructured":"He, P., Liu, X., Gao, J., Chen, W.: Deberta: decoding-enhanced bert with disentangled attention. Preprint arXiv:2006.03654 (2020)"},{"key":"22_CR19","unstructured":"Hochreiter, S., Schmidhuber, J.: Simplifying neural nets by discovering flat minima. In: NeurIPS (1994)"},{"key":"22_CR20","unstructured":"Hong, M., Wai, H.T., Wang, Z., Yang, Z.: A two-timescale framework for bilevel optimization: complexity analysis and application to actor-critic. Preprint arXiv:2007.05170 (2020)"},{"key":"22_CR21","unstructured":"Izmailov, P., Podoprikhin, D., Garipov, T., Vetrov, D., Wilson, A.G.: Averaging weights leads to wider optima and better generalization. In: UAI (2018)"},{"key":"22_CR22","unstructured":"Jiang, W., Kwok, J., Zhang, Y.: Effective meta-regularization by kernelized proximal regularization. In: NeurIPS (2021)"},{"key":"22_CR23","unstructured":"Jiang, W., Kwok, J., Zhang, Y.: Subspace learning for effective meta-learning. In: ICML (2022)"},{"key":"22_CR24","unstructured":"Jiang, W., Yang, H., Zhang, Y., Kwok, J.: An adaptive policy to employ sharpness-aware minimization. In: ICLR (2023)"},{"key":"22_CR25","unstructured":"Jiang, W., Zhang, Y., Kwok, J.: Effective structured prompting by meta-learning and representative verbalizer. In: ICML (2023)"},{"key":"22_CR26","unstructured":"Jiang, Y., Neyshabur, B., Mobahi, H., Krishnan, D., Bengio, S.: Fantastic generalization measures and where to find them. In: ICLR (2020)"},{"key":"22_CR27","unstructured":"Keskar, N.S., Mudigere, D., Nocedal, J., Smelyanskiy, M., Tang, P.T.P.: On large-batch training for deep learning: generalization gap and sharp minima. In: ICLR (2017)"},{"key":"22_CR28","unstructured":"Khan, M., Nielsen, D., Tangkaratt, V., Lin, W., Gal, Y., Srivastava, A.: Fast and scalable Bayesian deep learning by weight-perturbation in Adam. In: ICML (2018)"},{"key":"22_CR29","unstructured":"Koh, P.W., Liang, P.: Understanding black-box predictions via influence functions. In: ICML (2017)"},{"key":"22_CR30","unstructured":"Krizhevsky, A., Hinton, G.: Learning multiple layers of features from tiny images. Tech. rep. (2009)"},{"key":"22_CR31","unstructured":"Kwon, J., Kim, J., Park, H., Choi, I.K.: ASAM: adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks. In: ICML (2021)"},{"key":"22_CR32","unstructured":"Liao, R., et al.: Reviving and improving recurrent back-propagation. In: ICML (2018)"},{"key":"22_CR33","unstructured":"Liu, R., Liu, X., Yuan, X., Zeng, S., Zhang, J.: A value-function-based interior-point method for non-convex bi-level optimization. In: ICML (2021)"},{"key":"22_CR34","unstructured":"Liu, S., James, S., Davison, A.J., Johns, E.: Auto-Lambda: disentangling dynamic task relationships. TMLR (2022)"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Liu, Y., Mai, S., Chen, X., Hsieh, C.J., You, Y.: Towards efficient and scalable sharpness-aware minimization. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01204"},{"key":"22_CR36","unstructured":"Liu, Y., Mai, S., Cheng, M., Chen, X., Hsieh, C.J., You, Y.: Random sharpness-aware minimization. In: NeurIPS (2022)"},{"key":"22_CR37","unstructured":"Mi, P., et al.: Make sharpness-aware minimization stronger: a sparsified perturbation approach. In: NeurIPS (2022)"},{"key":"22_CR38","unstructured":"Pedregosa, F.: Hyperparameter optimization with approximate gradient. In: ICML (2016)"},{"key":"22_CR39","unstructured":"Petzka, H., Kamp, M., Adilova, L., Sminchisescu, C., Boley, M.: Relative flatness and generalization. In: NeurIPS (2021)"},{"key":"22_CR40","unstructured":"Qu, Z., Li, X., Duan, R., Liu, Y., Tang, B., Lu, Z.: Generalized federated learning via sharpness aware minimization. In: ICML (2022)"},{"key":"22_CR41","doi-asserted-by":"crossref","unstructured":"Reddi, S.J., Hefny, A., Sra, S., Poczos, B., Smola, A.: Stochastic variance reduction for nonconvex optimization. In: ICML (2016)","DOI":"10.1109\/ALLERTON.2016.7852377"},{"key":"22_CR42","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. In: IJCV (2015)","DOI":"10.1007\/s11263-015-0816-y"},{"key":"22_CR43","doi-asserted-by":"crossref","unstructured":"Sinha, A., Soun, T., Deb, K.: Using Karush-Kuhn-Tucker proximity measure for solving bilevel optimization problems. Swarm and Evolutionary Computation (2019)","DOI":"10.1016\/j.swevo.2018.06.004"},{"key":"22_CR44","unstructured":"Stadie, B., Zhang, L., Ba, J.: Learning intrinsic rewards as a bi-level optimization problem. In: UAI (2020)"},{"key":"22_CR45","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"22_CR46","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: GLUE: a multi-task benchmark and analysis platform for natural language understanding. Preprint arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"22_CR47","unstructured":"Ye, F., Lin, B., Yue, Z., Guo, P., Xiao, Q., Zhang, Y.: Multi-objective meta learning. In: NeurIPS (2021)"},{"key":"22_CR48","doi-asserted-by":"crossref","unstructured":"Zagoruyko, S., Komodakis, N.: Wide residual networks. In: BMVC (2016)","DOI":"10.5244\/C.30.87"},{"key":"22_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, C., Bengio, S., Hardt, M., Recht, B., Vinyals, O.: Understanding deep learning (still) requires rethinking generalization. In: Communications of the ACM (2021)","DOI":"10.1145\/3446776"},{"key":"22_CR50","unstructured":"Zhao, Y., Zhang, H., Hu, X.: Penalizing gradient norm for efficiently improving generalization in deep learning. In: ICML (2022)"},{"key":"22_CR51","unstructured":"Zhao, Y., Zhang, H., Hu, X.: Randomized sharpness-aware training for boosting computational efficiency in deep learning. Preprint arXiv:2203.09962 (2022)"},{"key":"22_CR52","unstructured":"Zhou, M., Liu, T., Li, Y., Lin, D., Zhou, E., Zhao, T.: Toward understanding the importance of noise in training neural networks. In: ICML (2019)"},{"key":"22_CR53","unstructured":"Zhuang, J., et al.: Surrogate gap minimization improves sharpness-aware training. In: ICLR (2022)"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70344-7_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T08:09:38Z","timestamp":1724918978000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70344-7_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031703430","9783031703447"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70344-7_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"22 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vilnius","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lithuania","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2024.ecmlpkdd.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}