{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T22:32:34Z","timestamp":1773441154948,"version":"3.50.1"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031442032","type":"print"},{"value":"9783031442049","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-44204-9_20","type":"book-chapter","created":{"date-parts":[[2023,9,21]],"date-time":"2023-09-21T04:02:11Z","timestamp":1695268931000},"page":"236-247","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Heavy-Tailed Regularization of\u00a0Weight Matrices in\u00a0Deep Neural Networks"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0187-3535","authenticated-orcid":false,"given":"Xuanzhe","family":"Xiao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3406-5051","authenticated-orcid":false,"given":"Zeng","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4292-8782","authenticated-orcid":false,"given":"Chuanlong","family":"Xie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6404-6441","authenticated-orcid":false,"given":"Fengwei","family":"Zhou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,9,22]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Auffinger, A., Ben Arous, G., P\u00e9ch\u00e9, S.: Poisson convergence for the largest eigenvalues of heavy tailed random matrices. In: Annales de l\u2019IHP Probabilit\u00e9s et Statistiques, vol. 45, pp. 589\u2013610 (2009)","DOI":"10.1214\/08-AIHP188"},{"key":"20_CR2","first-page":"29364","volume":"34","author":"M Barsbey","year":"2021","unstructured":"Barsbey, M., Sefidgaran, M., Erdogdu, M.A., Richard, G., Simsekli, U.: Heavy tails in SGD and compressibility of overparametrized neural networks. Adv. Neural. Inf. Process. Syst. 34, 29364\u201329378 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Bartlett, P., Maiorov, V., Meir, R.: Almost linear VC dimension bounds for piecewise polynomial networks. Adv. Neural. Inf. Process. Syst. 11 (1998)","DOI":"10.1162\/089976698300017016"},{"key":"20_CR4","unstructured":"Bartlett, P.L., Foster, D.J., Telgarsky, M.J.: Spectrally-normalized margin bounds for neural networks. Adv. Neural. Inf. Process. Syst. 30 (2017)"},{"issue":"Nov","key":"20_CR5","first-page":"463","volume":"3","author":"PL Bartlett","year":"2002","unstructured":"Bartlett, P.L., Mendelson, S.: Rademacher and Gaussian complexities: risk bounds and structural results. J. Mach. Learn. Res. 3(Nov), 463\u2013482 (2002)","journal-title":"J. Mach. Learn. Res."},{"key":"20_CR6","doi-asserted-by":"publisher","unstructured":"Chen, Q., Zhao, H., Li, W., Huang, P., Ou, W.: Behavior sequence transformer for e-commerce recommendation in Alibaba. In: Proceedings of the 1st International Workshop on Deep Learning Practice for High-Dimensional Sparse Data (2019). https:\/\/doi.org\/10.1145\/3326937.3341261, http:\/\/dx.doi.org\/10.1145\/3326937.3341261","DOI":"10.1145\/3326937.3341261"},{"issue":"3","key":"20_CR7","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1007\/s10687-016-0251-7","volume":"19","author":"RA Davis","year":"2016","unstructured":"Davis, R.A., Heiny, J., Mikosch, T., Xie, X.: Extreme value analysis for the sample autocovariance matrices of heavy-tailed multivariate time series. Extremes 19(3), 517\u2013547 (2016). https:\/\/doi.org\/10.1007\/s10687-016-0251-7","journal-title":"Extremes"},{"issue":"3","key":"20_CR8","doi-asserted-by":"publisher","first-page":"767","DOI":"10.1016\/j.spa.2015.10.001","volume":"126","author":"RA Davis","year":"2016","unstructured":"Davis, R.A., Mikosch, T., Pfaffel, O.: Asymptotic theory for the sample covariance matrix of a heavy-tailed multivariate time series. Stochast. Process. Appl. 126(3), 767\u2013799 (2016)","journal-title":"Stochast. Process. Appl."},{"issue":"1","key":"20_CR9","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1016\/j.spa.2013.07.005","volume":"124","author":"RA Davis","year":"2014","unstructured":"Davis, R.A., Pfaffel, O., Stelzer, R.: Limit theory for the largest eigenvalues of sample covariance matrices with heavy-tails. Stochast. Process. Appl. 124(1), 18\u201350 (2014)","journal-title":"Stochast. Process. Appl."},{"key":"20_CR10","doi-asserted-by":"publisher","unstructured":"Galassi, A., Lippi, M., Torroni, P.: Attention in natural language processing. IEEE Trans. Neural Netw. Learn. Syst. 32(10), 4291\u20134308 (2020). https:\/\/doi.org\/10.1109\/tnnls.2020.3019893, http:\/\/dx.doi.org\/10.1109\/tnnls.2020.3019893","DOI":"10.1109\/tnnls.2020.3019893"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR12","unstructured":"Hodgkinson, L., Mahoney, M.: Multiplicative noise and heavy tails in stochastic optimization. In: International Conference on Machine Learning, pp. 4262\u20134274. PMLR (2021)"},{"key":"20_CR13","unstructured":"Mandt, S., Hoffman, M.D., Blei, D.M.: Stochastic gradient descent as approximate Bayesian inference. arXiv preprint arXiv:1704.04289 (2017)"},{"key":"20_CR14","unstructured":"Martin, C.H., Mahoney, M.W.: Traditional and heavy-tailed self regularization in neural network models. arXiv preprint arXiv:1901.08276 (2019)"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Martin, C.H., Mahoney, M.W.: Heavy-tailed universality predicts trends in test accuracies for very large pre-trained deep neural networks. In: Proceedings of the 2020 SIAM International Conference on Data Mining, pp. 505\u2013513. SIAM (2020)","DOI":"10.1137\/1.9781611976236.57"},{"issue":"165","key":"20_CR16","first-page":"1","volume":"22","author":"CH Martin","year":"2021","unstructured":"Martin, C.H., Mahoney, M.W.: Implicit self-regularization in deep neural networks: evidence from random matrix theory and implications for learning. J. Mach. Learn. Res. 22(165), 1\u201373 (2021)","journal-title":"J. Mach. Learn. Res."},{"key":"20_CR17","unstructured":"Martin, C.H., Mahoney, M.W.: Post-mortem on a deep learning contest: a Simpson\u2019s paradox and the complementary roles of scale metrics versus shape metrics. arXiv preprint arXiv:2106.00734 (2021)"},{"issue":"1","key":"20_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41467-021-24025-8","volume":"12","author":"CH Martin","year":"2021","unstructured":"Martin, C.H., Peng, T.S., Mahoney, M.W.: Predicting trends in the quality of state-of-the-art neural networks without access to training or testing data. Nat. Commun. 12(1), 1\u201313 (2021)","journal-title":"Nat. Commun."},{"key":"20_CR19","unstructured":"Meng, X., Yao, J.: Impact of classification difficulty on the weight matrices spectra in deep learning and application to early-stopping. arXiv preprint arXiv:2111.13331 (2021)"},{"key":"20_CR20","unstructured":"Nagarajan, V., Kolter, J.Z.: Uniform convergence may be unable to explain generalization in deep learning. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"20_CR21","unstructured":"Neyshabur, B., Bhojanapalli, S., Srebro, N.: A PAC-Bayesian approach to spectrally-normalized margin bounds for neural networks. arXiv preprint arXiv:1707.09564 (2017)"},{"key":"20_CR22","first-page":"5138","volume":"33","author":"U Simsekli","year":"2020","unstructured":"Simsekli, U., Sener, O., Deligiannidis, G., Erdogdu, M.A.: Hausdorff dimension, heavy tails, and generalization in neural networks. Adv. Neural. Inf. Process. Syst. 33, 5138\u20135151 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR23","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1214\/ECP.v9-1112","volume":"9","author":"A Soshnikov","year":"2004","unstructured":"Soshnikov, A.: Poisson statistics for the largest eigenvalues of Wigner random matrices with heavy tails. Electron. Commun. Probab. 9, 82\u201391 (2004)","journal-title":"Electron. Commun. Probab."},{"issue":"5","key":"20_CR24","doi-asserted-by":"publisher","first-page":"851","DOI":"10.1162\/neco.1994.6.5.851","volume":"6","author":"V Vapnik","year":"1994","unstructured":"Vapnik, V., Levin, E., Le Cun, Y.: Measuring the VC-dimension of a learning machine. Neural Comput. 6(5), 851\u2013876 (1994)","journal-title":"Neural Comput."},{"key":"20_CR25","unstructured":"Vaswani, A., et al.: Attention is all you need. ArXiv abs\/1706.03762 (2017)"},{"issue":"3","key":"20_CR26","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1145\/3446776","volume":"64","author":"C Zhang","year":"2021","unstructured":"Zhang, C., Bengio, S., Hardt, M., Recht, B., Vinyals, O.: Understanding deep learning (still) requires rethinking generalization. Commun. ACM 64(3), 107\u2013115 (2021)","journal-title":"Commun. ACM"},{"key":"20_CR27","first-page":"21285","volume":"33","author":"P Zhou","year":"2020","unstructured":"Zhou, P., Feng, J., Ma, C., Xiong, C., Hoi, S.C.H., et al.: Towards theoretically understanding why SGD generalizes better than Adam in deep learning. Adv. Neural. Inf. Process. Syst. 33, 21285\u201321296 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2023"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-44204-9_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,21]],"date-time":"2023-09-21T06:27:06Z","timestamp":1695277626000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-44204-9_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031442032","9783031442049"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-44204-9_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"22 September 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Heraklion","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"32","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/e-nns.org\/icann2023\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"easyacademia.org","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"947","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"426","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"22","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"45% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"type of other papers accepted  : 9 Abstract","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}