{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T04:32:55Z","timestamp":1769747575021,"version":"3.49.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T00:00:00Z","timestamp":1744329600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T00:00:00Z","timestamp":1744329600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFA1000103"],"award-info":[{"award-number":["2021YFA1000103"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372469"],"award-info":[{"award-number":["62372469"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007129","name":"Natural Science Foundation of Shandong Province","doi-asserted-by":"publisher","award":["ZR2022LZH009"],"award-info":[{"award-number":["ZR2022LZH009"]}],"id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-07204-y","type":"journal-article","created":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T12:27:06Z","timestamp":1744374426000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Inference and training acceleration of deep learning partial differential equation solver"],"prefix":"10.1007","volume":"81","author":[{"given":"Xun","family":"Wang","sequence":"first","affiliation":[]},{"given":"Xianxi","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Xiangyu","family":"Meng","sequence":"additional","affiliation":[]},{"given":"Zeyang","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Siyu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Song","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,11]]},"reference":[{"issue":"5","key":"7204_CR1","doi-asserted-by":"publisher","first-page":"2019","DOI":"10.1029\/2019WR026731","volume":"56","author":"AM Tartakovsky","year":"2020","unstructured":"Tartakovsky AM, Marrero CO, Perdikaris P, Tartakovsky GD, Barajas-Solano D (2020) Physics-informed deep neural networks for learning parameters and constitutive relationships in subsurface flow problems. Water Resour Res 56(5):2019\u2013026731","journal-title":"Water Resour Res"},{"issue":"6","key":"7204_CR2","doi-asserted-by":"publisher","DOI":"10.1115\/1.4050542","volume":"143","author":"S Cai","year":"2021","unstructured":"Cai S, Wang Z, Wang S, Perdikaris P, Karniadakis GE (2021) Physics-informed neural networks for heat transfer problems. J Heat Transf 143(6):060801","journal-title":"J Heat Transf"},{"key":"7204_CR3","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2020.109491","volume":"416","author":"DZ Huang","year":"2020","unstructured":"Huang DZ, Xu K, Farhat C, Darve E (2020) Learning constitutive relations from indirect observations using deep neural networks. J Comput Phys 416:109491","journal-title":"J Comput Phys"},{"issue":"3","key":"7204_CR4","doi-asserted-by":"publisher","first-page":"195","DOI":"10.1002\/cnm.1640100303","volume":"10","author":"MG Dissanayake","year":"1994","unstructured":"Dissanayake MG, Phan-Thien N (1994) Neural-network-based approximations for solving partial differential equations. Commun Numer Methods Eng 10(3):195\u2013201","journal-title":"Commun Numer Methods Eng"},{"key":"7204_CR5","doi-asserted-by":"publisher","first-page":"686","DOI":"10.1016\/j.jcp.2018.10.045","volume":"378","author":"M Raissi","year":"2019","unstructured":"Raissi M, Perdikaris P, Karniadakis GE (2019) Physics-informed neural networks: a deep learning framework for solving forward and inverse problems involving nonlinear partial differential equations. J Comput Phys 378:686\u2013707","journal-title":"J Comput Phys"},{"key":"7204_CR6","doi-asserted-by":"publisher","first-page":"136","DOI":"10.1016\/j.jcp.2019.05.027","volume":"394","author":"Y Yang","year":"2019","unstructured":"Yang Y, Perdikaris P (2019) Adversarial uncertainty quantification in physics-informed neural networks. J Comput Phys 394:136\u2013152","journal-title":"J Comput Phys"},{"issue":"4","key":"7204_CR7","doi-asserted-by":"publisher","first-page":"2603","DOI":"10.1137\/18M1229845","volume":"41","author":"G Pang","year":"2019","unstructured":"Pang G, Lu L, Karniadakis GE (2019) fPINNs: fractional physics-informed neural networks. SIAM J Sci Comput 41(4):2603\u20132626","journal-title":"SIAM J Sci Comput"},{"key":"7204_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.cma.2020.113250","volume":"370","author":"X Meng","year":"2020","unstructured":"Meng X, Li Z, Zhang D, Karniadakis GE (2020) PPINN: parareal physics-informed neural network for time-dependent PDEs. Comput Methods Appl Mech Eng 370:113250","journal-title":"Comput Methods Appl Mech Eng"},{"issue":"3","key":"7204_CR9","doi-asserted-by":"publisher","first-page":"218","DOI":"10.1038\/s42256-021-00302-5","volume":"3","author":"L Lu","year":"2021","unstructured":"Lu L, Jin P, Pang G, Zhang Z, Karniadakis GE (2021) Learning nonlinear operators via deeponet based on the universal approximation theorem of operators. Nat Mach Intell 3(3):218\u2013229","journal-title":"Nat Mach Intell"},{"key":"7204_CR10","unstructured":"Li Z, Kovachki N, Azizzadenesheli K, Liu B, Bhattacharya K, Stuart A, Anandkumar A (2020) Fourier neural operator for parametric partial differential equations. arXiv:2010.08895"},{"key":"7204_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2023.107258","volume":"127","author":"J He","year":"2024","unstructured":"He J, Kushwaha S, Park J, Koric S, Abueidda D, Jasiuk I (2024) Sequential deep operator networks (s-deeponet) for predicting full-field solutions under time-dependent loads. Eng Appl Artif Intell 127:107258","journal-title":"Eng Appl Artif Intell"},{"key":"7204_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.cma.2022.114587","volume":"391","author":"S Goswami","year":"2022","unstructured":"Goswami S, Yin M, Yu Y, Karniadakis GE (2022) A physics-informed variational DeepONet for predicting crack path in quasi-brittle materials. Comput Methods Appl Mech Eng 391:114587","journal-title":"Comput Methods Appl Mech Eng"},{"key":"7204_CR13","doi-asserted-by":"publisher","DOI":"10.1016\/j.advwatres.2022.104180","volume":"163","author":"G Wen","year":"2022","unstructured":"Wen G, Li Z, Azizzadenesheli K, Anandkumar A, Benson SM (2022) U-FNO\u2014an enhanced Fourier neural operator-based deep-learning model for multiphase flow. Adv Water Resour 163:104180","journal-title":"Adv Water Resour"},{"issue":"11","key":"7204_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.isci.2022.105452","volume":"25","author":"MM Rashid","year":"2022","unstructured":"Rashid MM, Pittie T, Chakraborty S, Krishnan NA (2022) Learning the stress\u2013strain fields in digital composites using Fourier neural operator. Iscience 25(11):105452","journal-title":"Iscience"},{"issue":"40","key":"7204_CR15","doi-asserted-by":"publisher","first-page":"8605","DOI":"10.1126\/sciadv.abi8605","volume":"7","author":"S Wang","year":"2021","unstructured":"Wang S, Wang H, Perdikaris P (2021) Learning the solution operator of parametric partial differential equations with physics-informed DeepONets. Sci Adv 7(40):8605","journal-title":"Sci Adv"},{"issue":"4","key":"7204_CR16","doi-asserted-by":"publisher","first-page":"911","DOI":"10.1109\/72.392253","volume":"6","author":"T Chen","year":"1995","unstructured":"Chen T, Chen H (1995) Universal approximation to nonlinear operators by neural networks with arbitrary activation functions and its application to dynamical systems. IEEE Trans Neural Netw 6(4):911\u2013917","journal-title":"IEEE Trans Neural Netw"},{"key":"7204_CR17","unstructured":"Li M, Zhou L, Yang Z, Li A, Xia F, Andersen DG, Smola A (2013) Parameter server for distributed machine learning. In: Big learning NIPS workshop, vol 6. Lake Tahoe"},{"key":"7204_CR18","unstructured":"Sergeev A, Del Balso M (2018) Horovod: fast and easy distributed deep learning in TensorFlow. arXiv:1802.05799"},{"key":"7204_CR19","unstructured":"Goyal P, Doll\u00e1r P, Girshick R, Noordhuis P, Wesolowski L, Kyrola A, Tulloch A, Jia Y, He K (2017) Accurate, large minibatch SGD: training ImageNet in 1 hour. arXiv:1706.02677"},{"key":"7204_CR20","doi-asserted-by":"crossref","unstructured":"You Y, Zhang Z, Hsieh C-J, Demmel J, Keutzer K (2018) ImageNet training in minutes. In: Proceedings of the 47th International Conference on Parallel Processing, pp 1\u201310","DOI":"10.1145\/3225058.3225069"},{"key":"7204_CR21","unstructured":"Jia X, Song S, He W, Wang Y, Rong H, Zhou F, Xie L, Guo Z, Yang Y, Yu L, et al (2018) Highly scalable deep learning training system with mixed-precision: training ImageNet in four minutes. arXiv:1807.11205"},{"key":"7204_CR22","doi-asserted-by":"crossref","unstructured":"Sun P, Feng W, Han R, Yan S, Wen Y (2019) Optimizing network performance for distributed DNN training on GPU clusters: ImageNet\/AlexNet training in 1.5 minutes. arXiv:1902.06855","DOI":"10.1109\/TBDATA.2019.2957478"},{"key":"7204_CR23","unstructured":"Lian X, Zhang W, Zhang C, Liu J (2018) Asynchronous decentralized parallel stochastic gradient descent. In: International conference on machine learning. PMLR, pp 3043\u20133052"},{"key":"7204_CR24","doi-asserted-by":"crossref","unstructured":"Luo Q, He J, Zhuo Y, Qian X (2020) Prague: high-performance heterogeneity-aware asynchronous decentralized training. In: Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems, pp 401\u2013416","DOI":"10.1145\/3373376.3378499"},{"key":"7204_CR25","doi-asserted-by":"crossref","unstructured":"Rajbhandari S, Rasley J, Ruwase O, He Y (2020) Zero: memory optimizations toward training trillion parameter models. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, pp 1\u201316","DOI":"10.1109\/SC41405.2020.00024"},{"key":"7204_CR26","first-page":"1","volume":"1","author":"Z Jia","year":"2019","unstructured":"Jia Z, Zaharia M, Aiken A (2019) Beyond data and model parallelism for deep neural networks. Proc Mach Learn Syst 1:1\u201313","journal-title":"Proc Mach Learn Syst"},{"key":"7204_CR27","doi-asserted-by":"crossref","unstructured":"Wang M, Huang C-c, Li J (2019) Supporting very large models using automatic dataflow graph partitioning. In: Proceedings of the Fourteenth EuroSys Conference 2019, pp 1\u201317","DOI":"10.1145\/3302424.3303953"},{"key":"7204_CR28","unstructured":"Lepikhin D, Lee H, Xu Y, Chen D, Firat O, Huang Y, Krikun M, Shazeer N, Chen Z (2020) GShard: scaling giant models with conditional computation and automatic sharding. arXiv:2006.16668"},{"key":"7204_CR29","unstructured":"Xu Y, Lee H, Chen D, Hechtman B, Huang Y, Joshi R, Krikun M, Lepikhin D, Ly A, Maggioni M, et al (2021) GSPMD: general and scalable parallelization for ml computation graphs. arXiv:2105.04663"},{"key":"7204_CR30","doi-asserted-by":"crossref","unstructured":"Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Korthikanti V, Vainbrand D, Kashinkunti P, Bernauer J, Catanzaro B, et al (2021) Efficient large-scale language model training on GPU clusters using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp 1\u201315","DOI":"10.1145\/3458817.3476209"},{"key":"7204_CR31","unstructured":"Huang Y, Cheng Y, Bapna A, Firat O, Chen D, Chen M, Lee H, Ngiam J, Le QV, Wu Y et al (2019) GPipe: efficient training of giant neural networks using pipeline parallelism. In: Advances in neural information processing systems, vol 32"},{"key":"7204_CR32","doi-asserted-by":"crossref","unstructured":"Narayanan D, Harlap A, Phanishayee A, Seshadri V, Devanur NR, Ganger GR, Gibbons PB, Zaharia M (2019) Pipedream: generalized pipeline parallelism for DNN training. In: Proceedings of the 27th ACM Symposium on Operating Systems Principles, pp 1\u201315","DOI":"10.1145\/3341301.3359646"},{"key":"7204_CR33","first-page":"269","volume":"3","author":"B Yang","year":"2021","unstructured":"Yang B, Zhang J, Li J, R\u00e9 C, Aberger C, De Sa C (2021) PipeMare: asynchronous pipeline parallel DNN training. Proc Mach Learn Syst 3:269\u2013296","journal-title":"Proc Mach Learn Syst"},{"issue":"5","key":"7204_CR34","doi-asserted-by":"publisher","first-page":"3055","DOI":"10.1137\/20M1318043","volume":"43","author":"S Wang","year":"2021","unstructured":"Wang S, Teng Y, Perdikaris P (2021) Understanding and mitigating gradient flow pathologies in physics-informed neural networks. SIAM J Sci Comput 43(5):3055\u20133081","journal-title":"SIAM J Sci Comput"},{"issue":"11","key":"7204_CR35","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TPDS.2011.311","volume":"23","author":"J Kurzak","year":"2012","unstructured":"Kurzak J, Tomov S, Dongarra J (2012) Autotuning GEMM kernels for the Fermi GPU. IEEE Trans Parallel Distrib Syst 23(11):2045\u20132057","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"7204_CR36","doi-asserted-by":"crossref","unstructured":"Niu Y, Lu Z, Dong M, Jin Z, Liu W, Tan G (2021) TileSpMV: a tiled algorithm for sparse matrix-vector multiplication on GPUs. In: 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, pp 68\u201378","DOI":"10.1109\/IPDPS49936.2021.00016"},{"key":"7204_CR37","doi-asserted-by":"publisher","first-page":"1275","DOI":"10.1109\/TC.2024.3365942","volume":"73","author":"C Guo","year":"2024","unstructured":"Guo C, Xue F, Leng J, Qiu Y, Guan Y, Cui W, Chen Q, Guo M (2024) Accelerating sparse DNNs based on tiled GEMM. IEEE Trans Comput 73:1275\u20131289","journal-title":"IEEE Trans Comput"},{"key":"7204_CR38","unstructured":"Gray S (2014) MaXas: assembler for NVIDIA maxwell architecture. NervanaSystems\/MaxAs"},{"issue":"10","key":"7204_CR39","doi-asserted-by":"publisher","first-page":"3934","DOI":"10.1007\/s11227-015-1483-z","volume":"71","author":"J Filipovi\u010d","year":"2015","unstructured":"Filipovi\u010d J, Madzin M, Fousek J, Matyska L (2015) Optimizing CUDA code by kernel fusion: application on BLAS. J Supercomput 71(10):3934\u20133957","journal-title":"J Supercomput"},{"issue":"8","key":"7204_CR40","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1145\/2858788.2688521","volume":"50","author":"A Ashari","year":"2015","unstructured":"Ashari A, Tatikonda S, Boehm M, Reinwald B, Campbell K, Keenleyside J, Sadayappan P (2015) On optimizing machine learning workloads via kernel fusion. ACM SIGPLAN Not 50(8):173\u2013182","journal-title":"ACM SIGPLAN Not"},{"key":"7204_CR41","unstructured":"Zhang H, Zheng Z, Xu S, Dai W, Ho Q, Liang X, Hu Z, Wei J, Xie P, Xing EP (2017) Poseidon: an efficient communication architecture for distributed deep learning on $$\\{$$GPU$$\\}$$ clusters. In: 2017 USENIX Annual Technical Conference (USENIX ATC 17), pp 181\u2013193"},{"key":"7204_CR42","doi-asserted-by":"crossref","unstructured":"Zhang L, Shi S, Chu X, Wang W, Li B, Liu C (2023) Dear: accelerating distributed deep learning with fine-grained all-reduce pipelining. In: 2023 IEEE 43rd International Conference on Distributed Computing Systems (ICDCS). IEEE, pp 142\u2013153","DOI":"10.1109\/ICDCS57875.2023.00054"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07204-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-07204-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07204-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T12:27:20Z","timestamp":1744374440000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-07204-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,11]]},"references-count":42,"journal-issue":{"issue":"5","published-online":{"date-parts":[[2025,4]]}},"alternative-id":["7204"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-07204-y","relation":{},"ISSN":["1573-0484"],"issn-type":[{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,11]]},"assertion":[{"value":"14 March 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 April 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"733"}}