{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T00:21:09Z","timestamp":1759191669898,"version":"3.44.0"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032060778","type":"print"},{"value":"9783032060785","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T00:00:00Z","timestamp":1759190400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T00:00:00Z","timestamp":1759190400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-06078-5_25","type":"book-chapter","created":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T18:50:56Z","timestamp":1759171856000},"page":"436-452","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Gathering and Exploiting Higher-Order Information when Training Large Structured Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1007-0144","authenticated-orcid":false,"given":"Pierre","family":"Wolinski","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,30]]},"reference":[{"issue":"2","key":"25_CR1","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1162\/089976698300017746","volume":"10","author":"S Amari","year":"1998","unstructured":"Amari, S.: Natural gradient works efficiently in learning. Neural Comput. 10(2), 251\u2013276 (1998)","journal-title":"Neural Comput."},{"key":"25_CR2","unstructured":"Arora, S., Du, S., Hu, W., Li, Z., Wang, R.: Fine-grained analysis of optimization and generalization for overparameterized two-layer neural networks. In: International Conference on Machine Learning, pp. 322\u2013332 (2019)"},{"key":"25_CR3","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1007\/s10107-016-1065-8","volume":"163","author":"EG Birgin","year":"2017","unstructured":"Birgin, E.G., Gardenghi, J., Mart\u00ednez, J.M., Santos, S.A., Toint, P.L.: Worst-case evaluation complexity for unconstrained nonlinear optimization using high-order regularized models. Math. Program. 163, 359\u2013368 (2017)","journal-title":"Math. Program."},{"key":"25_CR4","first-page":"536","volume":"25","author":"AL Cauchy","year":"1847","unstructured":"Cauchy, A.L.: M\u00e9thode g\u00e9n\u00e9rale pour la r\u00e9solution des syst\u00e8mes d\u2019\u00e9quations simultan\u00e9es. Comptes rendus hebdomadaires des s\u00e9ances de l\u2019Acad\u00e9mie des sciences, Paris 25, 536\u2013538 (1847)","journal-title":"Comptes rendus hebdomadaires des s\u00e9ances de l\u2019Acad\u00e9mie des sciences, Paris"},{"key":"25_CR5","unstructured":"Clevert, D.A., Unterthiner, T., Hochreiter, S.: Fast and accurate deep network learning by exponential linear units (ELUS). arXiv preprint arXiv:1511.07289 (2015)"},{"key":"25_CR6","unstructured":"Dangel, F.J.: Backpropagation beyond the gradient. Ph.D. thesis, Universit\u00e4t T\u00fcbingen (2023)"},{"key":"25_CR7","unstructured":"Dieudonn\u00e9, J.: Foundations of Modern Analysis. No.\u00a010 in Pure and Applied Mathematics, Academic press (1960)"},{"key":"25_CR8","unstructured":"Du, S., Lee, J., Li, H., Wang, L., Zhai, X.: Gradient descent finds global minima of deep neural networks. In: International Conference on Machine Learning, pp. 1675\u20131685 (2019)"},{"key":"25_CR9","unstructured":"Gill, P.E., Murray, W., Wright, M.H.: Practical Optimization. Academic Press, San Diego (1981)"},{"key":"25_CR10","unstructured":"Goldfarb, D., Ren, Y., Bahamou, A.: Practical quasi-Newton methods for training deep neural networks. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 2386\u20132396 (2020)"},{"key":"25_CR11","unstructured":"Gower, R., Kovalev, D., Lieder, F., Richt\u00e1rik, P.: RSN: randomized subspace Newton. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"25_CR12","unstructured":"Gupta, V., Koren, T., Singer, Y.: Shampoo: preconditioned stochastic tensor optimization. In: International Conference on Machine Learning, pp. 1842\u20131850 (2018)"},{"key":"25_CR13","unstructured":"Hanzely, F., Doikov, N., Nesterov, Y., Richtarik, P.: Stochastic subspace cubic Newton method. In: International Conference on Machine Learning, pp. 4027\u20134038 (2020)"},{"key":"25_CR14","unstructured":"Jacot, A., Gabriel, F., Hongler, C.: Neural tangent kernel: convergence and generalization in neural networks. In: Advances in Neural Information Processing Systems, vol.\u00a031 (2018)"},{"key":"25_CR15","unstructured":"Kornblith, S., Norouzi, M., Lee, H., Hinton, G.: Similarity of neural network representations revisited. In: International Conference on Machine Learning, pp. 3519\u20133529 (2019)"},{"issue":"11","key":"25_CR16","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"25_CR17","unstructured":"Lee, J., et al.: Wide neural networks of any depth evolve as linear models under gradient descent. In: Advances in Neural Information Processing Systems, vol.\u00a032 (2019)"},{"key":"25_CR18","unstructured":"Lu, Y., Harandi, M., Hartley, R., Pascanu, R.: Block mean approximation for efficient second order optimization. arXiv preprint arXiv:1804.05484 (2018)"},{"key":"25_CR19","doi-asserted-by":"publisher","unstructured":"Luenberger, D.G., Ye, Y.: Linear and Nonlinear Programming. Springer, 4th edn. (2008). https:\/\/doi.org\/10.1007\/978-0-387-74503-9","DOI":"10.1007\/978-0-387-74503-9"},{"key":"25_CR20","unstructured":"Martens, J., Grosse, R.: Optimizing neural networks with Kronecker-factored approximate curvature. In: International Conference on Machine Learning, pp. 2408\u20132417 (2015)"},{"issue":"4","key":"25_CR21","doi-asserted-by":"publisher","first-page":"667","DOI":"10.1002\/cpa.22008","volume":"75","author":"S Mei","year":"2022","unstructured":"Mei, S., Montanari, A.: The generalization error of random features regression: precise asymptotics and the double descent curve. Commun. Pure Appl. Math. 75(4), 667\u2013766 (2022)","journal-title":"Commun. Pure Appl. Math."},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Nash, S.G.: Newton-type minimization via the Lanczos method. SIAM J. Numer. Anal. 21(4), 770\u2013788 (1984)","DOI":"10.1137\/0721052"},{"key":"25_CR23","doi-asserted-by":"publisher","unstructured":"Nesterov, Y.: Introductory Lectures on Convex Optimization: A Basic Course, vol.\u00a087. Springer Science & Business Media (2003). https:\/\/doi.org\/10.1007\/978-1-4419-8853-9","DOI":"10.1007\/978-1-4419-8853-9"},{"issue":"1","key":"25_CR24","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1007\/s10107-006-0706-8","volume":"108","author":"Y Nesterov","year":"2006","unstructured":"Nesterov, Y., Polyak, B.T.: Cubic regularization of newton method and its global performance. Math. Program. 108(1), 177\u2013205 (2006)","journal-title":"Math. Program."},{"key":"25_CR25","doi-asserted-by":"publisher","unstructured":"Nocedal, J., Wright, S.J.: Numerical optimization. Springer (1999). https:\/\/doi.org\/10.1007\/978-0-387-40065-5","DOI":"10.1007\/978-0-387-40065-5"},{"key":"25_CR26","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1109\/LSP.2021.3050708","volume":"28","author":"T Nonomura","year":"2021","unstructured":"Nonomura, T., Ono, S., Nakai, K., Saito, Y.: Randomized subspace newton convex method applied to data-driven sensor selection problem. IEEE Signal Process. Lett. 28, 284\u2013288 (2021)","journal-title":"IEEE Signal Process. Lett."},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Ollivier, Y.: Riemannian metrics for neural networks I: feedforward networks. Inf. Inference J. IMA 4(2), 108\u2013153 (2015)","DOI":"10.1093\/imaiai\/iav006"},{"issue":"1","key":"25_CR28","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1162\/neco.1994.6.1.147","volume":"6","author":"BA Pearlmutter","year":"1994","unstructured":"Pearlmutter, B.A.: Fast exact multiplication by the hessian. Neural Comput. 6(1), 147\u2013160 (1994)","journal-title":"Neural Comput."},{"key":"25_CR29","unstructured":"Ren, Y., Goldfarb, D.: Tensor normal training for deep learning models. In: Advances in Neural Information Processing Systems, vol.\u00a034, pp. 26040\u201326052 (2021)"},{"key":"25_CR30","unstructured":"Sagun, L., Evci, U., Guney, V.U., Dauphin, Y., Bottou, L.: Empirical analysis of the Hessian of over-parametrized neural networks. In: International Conference on Learning Representations (2018)"},{"key":"25_CR31","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"issue":"9","key":"25_CR32","doi-asserted-by":"publisher","first-page":"1607","DOI":"10.1016\/S0893-6080(98)00091-4","volume":"11","author":"YJ Wang","year":"1998","unstructured":"Wang, Y.J., Lin, C.T.: A second-order learning algorithm for multilayer networks based on block hessian matrix. Neural Netw. 11(9), 1607\u20131622 (1998)","journal-title":"Neural Netw."},{"key":"25_CR33","unstructured":"Yang, G., Hu, E.J.: Tensor programs iv: Feature learning in infinite-width neural networks. In: International Conference on Machine Learning, pp. 11727\u201311737 (2021)"},{"key":"25_CR34","doi-asserted-by":"crossref","unstructured":"Yao, Z., Gholami, A., Shen, S., Mustafa, M., Keutzer, K., Mahoney, M.: ADAHESSIAN: an adaptive second order optimizer for machine learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, no. 12, pp. 10665\u201310673 (2021)","DOI":"10.1609\/aaai.v35i12.17275"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Yuan, R., Lazaric, A., Gower, R.M.: Sketched newton-Raphson. SIAM J. Optim. 32(3), 1555\u20131583 (2022)","DOI":"10.1137\/21M139788X"},{"issue":"67","key":"25_CR36","first-page":"1","volume":"23","author":"C Zhang","year":"2022","unstructured":"Zhang, C., Bengio, S., Singer, Y.: Are all layers created equal? J. Mach. Learn. Res. 23(67), 1\u201328 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"25_CR37","unstructured":"Zhang, G., Martens, J., Grosse, R.B.: Fast convergence of natural gradient descent for over-parameterized neural networks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-06078-5_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T18:51:08Z","timestamp":1759171868000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-06078-5_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,30]]},"ISBN":["9783032060778","9783032060785"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-06078-5_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,30]]},"assertion":[{"value":"30 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that\u00a0are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Porto","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecmlpkdd.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}