{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,5,18]],"date-time":"2024-05-18T00:40:19Z","timestamp":1715992819994},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3659914.3659931","type":"proceedings-article","created":{"date-parts":[[2024,5,15]],"date-time":"2024-05-15T14:13:51Z","timestamp":1715782431000},"update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PETScML: Second-Order Solvers for Training Regression Problems in Scientific Machine Learning"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-0435-0433","authenticated-orcid":false,"given":"Stefano","family":"Zampini","sequence":"first","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-2577-1106","authenticated-orcid":false,"given":"Umberto","family":"Zerbinati","sequence":"additional","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-1692-5812","authenticated-orcid":false,"given":"George","family":"Turkyyiah","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-4052-7224","authenticated-orcid":false,"given":"David","family":"Keyes","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cam.2005.12.030"},{"key":"e_1_3_2_1_2_1","volume-title":"Second Order Optimization Made Practical. CoRR abs\/2002.09018","author":"Anil Rohan","year":"2020","unstructured":"Rohan Anil, Vineet Gupta, Tomer Koren, Kevin Regan, and Yoram Singer. 2020. Second Order Optimization Made Practical. CoRR abs\/2002.09018 (2020). arXiv:2002.09018"},{"key":"e_1_3_2_1_4_1","volume-title":"Lois Curfman McInnes","author":"Balay Satish","year":"2023","unstructured":"Satish Balay, Shrirang Abhyankar, Mark F. Adams, Steven Benson, Jed Brown, Peter Brune, Kris Buschelman, Emil M. Constantinescu, Lisandro Dalcin, Alp Dener, Victor Eijkhout, Jacob Faibussowitsch, William D. Gropp, V\u00e1clav Hapla, Tobin Isaac, Pierre Jolivet, Dmitry Karpeev, Dinesh Kaushik, Matthew G. Knepley, Fande Kong, Scott Kruger, Dave A. May, Lois Curfman McInnes, Richard Tran Mills, Lawrence Mitchell, Todd Munson, Jose E. Roman, Karl Rupp, Patrick Sanan, Jason Sarich, Barry F. Smith, Stefano Zampini, Hong Zhang, Hong Zhang, and Junchao Zhang. 2023. PETSc Web page. https:\/\/petsc.org\/"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1002\/nla.2322"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"e_1_3_2_1_7_1","volume-title":"Data-driven discovery of Green's functions with human-understandable deep learning. Scientific reports 12, 1","author":"Boull\u00e9 Nicolas","year":"2022","unstructured":"Nicolas Boull\u00e9, Christopher J Earls, and Alex Townsend. 2022. Data-driven discovery of Green's functions with human-understandable deep learning. Scientific reports 12, 1 (2022), 4824."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2303904120"},{"key":"e_1_3_2_1_9_1","volume-title":"Rational neural networks. Advances in neural information processing systems 33","author":"Boull\u00e9 Nicolas","year":"2020","unstructured":"Nicolas Boull\u00e9, Yuji Nakatsukasa, and Alex Townsend. 2020. Rational neural networks. Advances in neural information processing systems 33 (2020), 14243--14253."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1137\/130936725"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1517384113"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1137\/140954362"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/3119411.3119541"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1137\/S106482750037620X"},{"key":"e_1_3_2_1_15_1","volume-title":"Nicholas IM Gould, and Philippe L Toint","author":"Conn Andrew R","year":"2000","unstructured":"Andrew R Conn, Nicholas IM Gould, and Philippe L Toint. 2000. Trust region methods. SIAM."},{"key":"e_1_3_2_1_16_1","volume-title":"Approximation by superpositions of a sigmoidal function. Mathematics of control, signals and systems 2, 4","author":"Cybenko George","year":"1989","unstructured":"George Cybenko. 1989. Approximation by superpositions of a sigmoidal function. Mathematics of control, signals and systems 2, 4 (1989), 303--314."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.advwatres.2011.04.013"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3627037"},{"key":"e_1_3_2_1_19_1","volume-title":"Elizabeth Qian, and Andrew M Stuart.","author":"de Hoop Maarten V","year":"2022","unstructured":"Maarten V de Hoop, Daniel Zhengyu Huang, Elizabeth Qian, and Andrew M Stuart. 2022. The cost-accuracy trade-off in operator learning with neural networks. Journal of Machine Learning (2022)."},{"key":"e_1_3_2_1_20_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_21_1","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi John","year":"2011","unstructured":"John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization. Journal of machine learning research 12, 7 (2011).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1137\/0917003"},{"key":"e_1_3_2_1_23_1","volume-title":"Matthew James Johnson, and Chris Leary","author":"Frostig Roy","year":"2018","unstructured":"Roy Frostig, Matthew James Johnson, and Chris Leary. 2018. Compiling machine learning programs via high-level tracing. Systems for Machine Learning 4(9) (2018)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495925"},{"key":"e_1_3_2_1_25_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_26_1","volume-title":"Shampoo: Preconditioned Stochastic Tensor Optimization.","author":"Gupta Vineet","year":"2018","unstructured":"Vineet Gupta, Tomer Koren, and Yoram Singer. 2018. Shampoo: Preconditioned Stochastic Tensor Optimization."},{"key":"e_1_3_2_1_27_1","volume-title":"Gaussian Error Linear Units (GELUs). arXiv preprint arXiv:1606.08415","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian Error Linear Units (GELUs). arXiv preprint arXiv:1606.08415 (2016)."},{"key":"e_1_3_2_1_28_1","volume-title":"Multilayer feedforward networks are universal approximators. Neural networks 2, 5","author":"Hornik Kurt","year":"1989","unstructured":"Kurt Hornik, Maxwell Stinchcombe, and Halbert White. 1989. Multilayer feedforward networks are universal approximators. Neural networks 2, 5 (1989), 359--366."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_30_1","first-page":"7703","article-title":"Going beyond linear transformers with recurrent fast weight programmers","volume":"34","author":"Irie Kazuki","year":"2021","unstructured":"Kazuki Irie, Imanol Schlag, R\u00f3bert Csord\u00e1s, and J\u00fcrgen Schmidhuber. 2021. Going beyond linear transformers with recurrent fast weight programmers. Advances in Neural Information Processing Systems 34 (2021), 7703--7717.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433707"},{"key":"e_1_3_2_1_32_1","volume-title":"ADAM: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. ADAM: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3546258.3546548"},{"key":"e_1_3_2_1_34_1","first-page":"1","article-title":"Neural Operator: Learning Maps Between Function Spaces With Applications to PDEs","volume":"24","author":"Kovachki Nikola B","year":"2023","unstructured":"Nikola B Kovachki, Zongyi Li, Burigede Liu, Kamyar Azizzadenesheli, Kaushik Bhattacharya, Andrew M Stuart, and Anima Anandkumar. 2023. Neural Operator: Learning Maps Between Function Spaces With Applications to PDEs. J. Mach. Learn. Res. 24, 89 (2023), 1--97.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1093\/imatrm\/tnac001"},{"key":"e_1_3_2_1_36_1","volume-title":"Deep Learning. nature 521, 7553","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep Learning. nature 521, 7553 (2015), 436--444."},{"key":"e_1_3_2_1_37_1","volume-title":"Preconditioned stochastic gradient descent","author":"Xi-Lin Li.","year":"2017","unstructured":"Xi-Lin Li. 2017. Preconditioned stochastic gradient descent. IEEE transactions on neural networks and learning systems 29, 5 (2017), 1454--1466."},{"key":"e_1_3_2_1_38_1","volume-title":"Fourier Neural Operator for parametric partial differential equations. arXiv preprint arXiv:2010.08895","author":"Li Zongyi","year":"2020","unstructured":"Zongyi Li, Nikola Kovachki, Kamyar Azizzadenesheli, Burigede Liu, Kaushik Bhattacharya, Andrew Stuart, and Anima Anandkumar. 2020. Fourier Neural Operator for parametric partial differential equations. arXiv preprint arXiv:2010.08895 (2020)."},{"key":"e_1_3_2_1_39_1","volume-title":"On the limited memory BFGS method for large scale optimization. Mathematical programming 45, 1--3","author":"Liu Dong C","year":"1989","unstructured":"Dong C Liu and Jorge Nocedal. 1989. On the limited memory BFGS method for large scale optimization. Mathematical programming 45, 1--3 (1989), 503--528."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2023.112548"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1137\/19M1302211"},{"key":"e_1_3_2_1_42_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_43_1","volume-title":"Learning nonlinear operators via DeepONet based on the universal approximation theorem of operators. Nature machine intelligence 3, 3","author":"Lu Lu","year":"2021","unstructured":"Lu Lu, Pengzhan Jin, Guofei Pang, Zhongqiang Zhang, and George Em Karniadakis. 2021. Learning nonlinear operators via DeepONet based on the universal approximation theorem of operators. Nature machine intelligence 3, 3 (2021), 218--229."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cma.2022.114778"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1137\/21M1465718"},{"key":"e_1_3_2_1_46_1","volume-title":"International conference on machine learning. PMLR, 2408--2417","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse. 2015. Optimizing neural networks with kronecker-factored approximate curvature. In International conference on machine learning. PMLR, 2408--2417."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 28th international conference on machine learning (ICML-11)","author":"Martens James","year":"2011","unstructured":"James Martens and Ilya Sutskever. 2011. Learning recurrent neural networks with Hessian-free optimization. In Proceedings of the 28th international conference on machine learning (ICML-11). 1033--1040."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2021.102831"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.5555\/3104322.3104425"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1137\/19M1288802"},{"key":"e_1_3_2_1_51_1","volume-title":"Numerical optimization","author":"Nocedal Jorge","unstructured":"Jorge Nocedal and Stephen J Wright. 2006. Numerical optimization. Springer."},{"key":"e_1_3_2_1_52_1","unstructured":"Thomas O'Leary-Roseberry Nick Alger and Omar Ghattas. 2021. Low Rank Saddle Free Newton: A Scalable Method for Stochastic Nonconvex Optimization. arXiv:2002.02881"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2023.112555"},{"key":"e_1_3_2_1_54_1","volume-title":"ASDL: A Unified Interface for Gradient Preconditioning in PyTorch. arXiv preprint arXiv:2305.04684","author":"Osawa Kazuki","year":"2023","unstructured":"Kazuki Osawa, Satoki Ishikawa, Rio Yokota, Shigang Li, and Torsten Hoefler. 2023. ASDL: A Unified Interface for Gradient Preconditioning in PyTorch. arXiv preprint arXiv:2305.04684 (2023)."},{"key":"e_1_3_2_1_55_1","unstructured":"Kazuki Osawa Yohei Tsuji Yuichiro Ueno Akira Naruse Rio Yokota and S. Matsuoka. 2018. Second-order Optimization Method for Large Mini-batch: Training ResNet-50 on ImageNet in 35 Epochs. ArXiv abs\/1811.12019 (2018)."},{"key":"e_1_3_2_1_56_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476152"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.5555\/3291125.3291150"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2018.10.045"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-018-1346-5"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.psychres.2021.114135"},{"key":"e_1_3_2_1_62_1","volume-title":"Deep learning in neural networks: An overview. Neural networks 61","author":"Schmidhuber J\u00fcrgen","year":"2015","unstructured":"J\u00fcrgen Schmidhuber. 2015. Deep learning in neural networks: An overview. Neural networks 61 (2015), 85--117."},{"key":"e_1_3_2_1_63_1","volume-title":"Fast curvature matrix-vector products for second-order gradient descent. Neural computation 14, 7","author":"Schraudolph Nicol N","year":"2002","unstructured":"Nicol N Schraudolph. 2002. Fast curvature matrix-vector products for second-order gradient descent. Neural computation 14, 7 (2002), 1723--1738."},{"key":"e_1_3_2_1_64_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Smith Samuel L","year":"2021","unstructured":"Samuel L Smith, Benoit Dherin, David GT Barrett, and Soham De. 2021. On the origin of implicit regularization in stochastic gradient descent. 9th International Conference on Learning Representations, ICLR 2021 (2021)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1137\/0720042"},{"key":"e_1_3_2_1_66_1","volume-title":"International conference on machine learning. PMLR, 1139--1147","author":"Sutskever Ilya","year":"2013","unstructured":"Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton. 2013. On the importance of initialization and momentum in deep learning. In International conference on machine learning. PMLR, 1139--1147."},{"key":"e_1_3_2_1_67_1","volume-title":"Sparse matrices and their uses","author":"Toint Philippe","unstructured":"Philippe Toint. 1981. Towards an efficient sparsity exploiting Newton method for minimization. In Sparse matrices and their uses. Academic press, 57--88."},{"key":"e_1_3_2_1_68_1","volume-title":"Training (Overparametrized) Neural Networks in Near-Linear Time. In 12th Innovations in Theoretical Computer Science Conference (ITCS 2021) (Leibniz International Proceedings in Informatics (LIPIcs)","volume":"15","author":"van den Brand Jan","year":"2021","unstructured":"Jan van den Brand, Binghui Peng, Zhao Song, and Omri Weinstein. 2021. Training (Overparametrized) Neural Networks in Near-Linear Time. In 12th Innovations in Theoretical Computer Science Conference (ITCS 2021) (Leibniz International Proceedings in Informatics (LIPIcs), Vol. 185), James R. Lee (Ed.). Schloss Dagstuhl-Leibniz-Zentrum f\u00fcr Informatik, Dagstuhl, Germany, 63:1--63:15."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1088\/2752-5724\/ac681d"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-019-01405-z"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10915-022-01911-x"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17275"},{"key":"e_1_3_2_1_73_1","volume-title":"Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888 6, 12","author":"You Yang","year":"2017","unstructured":"Yang You, Igor Gitman, and Boris Ginsburg. 2017. Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888 6, 12 (2017), 6."},{"key":"e_1_3_2_1_74_1","volume-title":"8th International Conference on Learning Representations, ICLR 2020","author":"You Yang","year":"2020","unstructured":"Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. 2020. Large batch optimization for deep learning: training BERT in 76 minutes. 8th International Conference on Learning Representations, ICLR 2020 (2020)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40304-018-0127-z"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3084070"},{"key":"e_1_3_2_1_77_1","volume-title":"Cryo-DRGN: reconstruction of heterogeneous cryo-EM structures using neural networks. Nature methods 18, 2","author":"Zhong Ellen D","year":"2021","unstructured":"Ellen D Zhong, Tristan Bepler, Bonnie Berger, and Joseph H Davis. 2021. Cryo-DRGN: reconstruction of heterogeneous cryo-EM structures using neural networks. Nature methods 18, 2 (2021), 176--185."}],"event":{"name":"PASC '24: Platform for Advanced Scientific Computing Conference","location":"Zurich Switzerland","acronym":"PASC '24","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","ETH Zurich \/ CSCS"]},"container-title":["Proceedings of the Platform for Advanced Scientific Computing Conference"],"original-title":[],"deposited":{"date-parts":[[2024,5,17]],"date-time":"2024-05-17T18:32:43Z","timestamp":1715970763000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3659914.3659931"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":76,"alternative-id":["10.1145\/3659914.3659931","10.1145\/3659914"],"URL":"http:\/\/dx.doi.org\/10.1145\/3659914.3659931","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-06-03","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}