{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T02:48:08Z","timestamp":1776134888789,"version":"3.50.1"},"reference-count":31,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72401024"],"award-info":[{"award-number":["72401024"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["W2511076"],"award-info":[{"award-number":["W2511076"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Reliability Engineering &amp; System Safety"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.ress.2026.112484","type":"journal-article","created":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T17:48:56Z","timestamp":1772473736000},"page":"112484","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"P1","title":["Service reliability of intelligent computing cluster systems for large language models"],"prefix":"10.1016","volume":"272","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7452-8073","authenticated-orcid":false,"given":"Hanxiao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yingdong","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5755-7115","authenticated-orcid":false,"given":"Yan-Fu","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"1","key":"10.1016\/j.ress.2026.112484_bib0001","doi-asserted-by":"crossref","first-page":"6","DOI":"10.1007\/s11390-024-4178-1","article-title":"Ai computing systems for large language models training","volume":"40","author":"Zhang","year":"2025","journal-title":"J Comput Sci Technol"},{"issue":"18","key":"10.1016\/j.ress.2026.112484_bib0002","doi-asserted-by":"crossref","first-page":"5681","DOI":"10.1016\/j.apm.2015.01.036","article-title":"Service reliability modeling of distributed computing systems with virus epidemics","volume":"39","author":"Li","year":"2015","journal-title":"Appl Math Model"},{"issue":"2","key":"10.1016\/j.ress.2026.112484_bib0003","doi-asserted-by":"crossref","first-page":"527","DOI":"10.1109\/TR.2022.3154651","article-title":"Service-oriented reliability modeling and autonomous optimization of reliability for public cloud computing systems","volume":"71","author":"Meng","year":"2022","journal-title":"IEEE Trans Reliab"},{"issue":"1","key":"10.1016\/j.ress.2026.112484_bib0004","doi-asserted-by":"crossref","first-page":"200","DOI":"10.1109\/TR.2019.2909279","article-title":"Reliability-aware offloading and allocation in multilevel edge computing system","volume":"70","author":"Dong","year":"2019","journal-title":"IEEE Trans Reliab"},{"issue":"1","key":"10.1016\/j.ress.2026.112484_bib0005","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1109\/TR.2022.3161638","article-title":"Reliability of a distributed data storage system considering the external impacts","volume":"72","author":"Kou","year":"2022","journal-title":"IEEE Trans Reliab"},{"key":"10.1016\/j.ress.2026.112484_bib0006","doi-asserted-by":"crossref","DOI":"10.1016\/j.ress.2019.106643","article-title":"Formal models for safety and performance analysis of a data center system","volume":"193","author":"Bennaceur","year":"2020","journal-title":"Reliability Engineering & System Safety"},{"key":"10.1016\/j.ress.2026.112484_bib0007","first-page":"1","article-title":"Various network topologies and an analysis comparative between fat-tree and BCube for a data center network: an overview","author":"Castillo","year":"2022","journal-title":"2022 IEEE Cloud Summit"},{"key":"10.1016\/j.ress.2026.112484_bib0008","article-title":"Gpipe: efficient training of giant neural networks using pipeline parallelism","volume":"32","author":"Huang","year":"2019","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.ress.2026.112484_bib0009","series-title":"SC20: International conference for high performance computing, networking, storage and analysis","first-page":"1","article-title":"Gems: gpu-enabled memory-aware model-parallelism system for distributed dnn training","author":"Jain","year":"2020"},{"key":"10.1016\/j.ress.2026.112484_bib0010","unstructured":"Liu A., Feng B., Xue B., Wang B., Wu B., Lu C., et al. Deepseek-v3 technical report. 2024; arXiv preprint arXiv: 241219437."},{"key":"10.1016\/j.ress.2026.112484_bib0011","series-title":"Proceedings of the 26th ACM SIGPLAN symposium on principles and practice of parallel programming","first-page":"431","article-title":"Dapple: a pipelined data parallel approach for training large models","author":"Fan","year":"2021"},{"issue":"3","key":"10.1016\/j.ress.2026.112484_bib0012","doi-asserted-by":"crossref","first-page":"931","DOI":"10.1109\/TR.2019.2923770","article-title":"Slow replica and shared protection: energy-efficient and reliable task assignment in cloud data centers","volume":"70","author":"Fan","year":"2019","journal-title":"IEEE Trans Reliab"},{"key":"10.1016\/j.ress.2026.112484_bib0013","doi-asserted-by":"crossref","DOI":"10.1016\/j.ress.2020.107381","article-title":"A generalized petri net-based modeling framework for service reliability evaluation and management of cloud data centers","volume":"207","author":"Li","year":"2021","journal-title":"Reliab Eng System Safety"},{"issue":"4","key":"10.1016\/j.ress.2026.112484_bib0014","doi-asserted-by":"crossref","first-page":"1555","DOI":"10.1109\/TR.2021.3111031","article-title":"Reliability of a distributed computing system with performance sharing","volume":"71","author":"Xiao","year":"2021","journal-title":"IEEE Trans Reliab"},{"issue":"2","key":"10.1016\/j.ress.2026.112484_bib0015","doi-asserted-by":"crossref","first-page":"547","DOI":"10.1109\/TR.2017.2678480","article-title":"Optimal scheduling and management on correlating reliability, performance, and energy consumption for multiagent cloud systems","volume":"66","author":"Sun","year":"2017","journal-title":"IEEE Trans Reliab"},{"issue":"2","key":"10.1016\/j.ress.2026.112484_bib0016","doi-asserted-by":"crossref","first-page":"620","DOI":"10.1109\/TR.2019.2901194","article-title":"Improving failure tolerance in large-scale cloud computing systems","volume":"68","author":"Luo","year":"2019","journal-title":"IEEE Trans Reliab"},{"key":"10.1016\/j.ress.2026.112484_bib0017","series-title":"Proceedings of the international conference for high performance computing, networking, storage and analysis","first-page":"1","article-title":"Efficient large-scale language model training on gpu clusters using megatron-lm","author":"Narayanan","year":"2021"},{"key":"10.1016\/j.ress.2026.112484_bib0018","series-title":"16Th USENIX symposium on operating systems design and implementation (OSDI 22)","first-page":"559","article-title":"Alpa: automating inter-and intra-operator parallelism for distributed deep learning","author":"Zheng","year":"2022"},{"key":"10.1016\/j.ress.2026.112484_bib0019","doi-asserted-by":"crossref","unstructured":"Li S., Zhao Y., Varma R., Salpekar O., Noordhuis P., Li T., et al. Pytorch distributed: experiences on accelerating data parallel training. 2020; arXiv preprint arXiv: 200615704.","DOI":"10.14778\/3415478.3415530"},{"key":"10.1016\/j.ress.2026.112484_bib0020","series-title":"SC20: International conference for high performance computing, networking, storage and analysis","first-page":"1","article-title":"Zero: memory optimizations toward training trillion parameter models","author":"Rajbhandari","year":"2020"},{"key":"10.1016\/j.ress.2026.112484_bib0021","article-title":"Mesh-tensorflow: deep learning for supercomputers","volume":"31","author":"Shazeer","year":"2018","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.ress.2026.112484_bib0022","unstructured":"Shoeybi M., Patwary M., Puri R., LeGresley P., Casper J., Catanzaro B.. Megatron-lm: training multi-billion parameter language models using model parallelism. 2019; arXiv preprint arXiv: 190908053."},{"key":"10.1016\/j.ress.2026.112484_bib0023","unstructured":"Hewett R.J., Grady II T.J.. A linear algebraic approach to model parallelism in deep learning. 2020; arXiv preprint arXiv: 200603108."},{"key":"10.1016\/j.ress.2026.112484_bib0024","unstructured":"Shi Z., Jiang L., Wang A., Zhang J., Jia X., Li Y., et al. Tap: accelerating large-scale dnn training through tensor automatic parallelisation. 2023; arXiv preprint arXiv: 230200247."},{"key":"10.1016\/j.ress.2026.112484_bib0025","doi-asserted-by":"crossref","unstructured":"Harlap A., Narayanan D., Phanishayee A., Seshadri V., Devanur N., Ganger G., et al. Pipedream: fast and efficient pipeline parallel dnn training. 2018; arXiv preprint arXiv: 180603377.","DOI":"10.1145\/3341301.3359646"},{"key":"10.1016\/j.ress.2026.112484_bib0026","series-title":"Proceedings of the international conference for high performance computing, networking, storage and analysis","first-page":"1","article-title":"Chimera: efficiently training large-scale neural networks with bidirectional pipelines","author":"Li","year":"2021"},{"key":"10.1016\/j.ress.2026.112484_bib0027","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.ress.2026.112484_bib0028","unstructured":"Falk S., Corr\u00eaa N.K., Luccioni S., Biber-Freudenberger L., van Wynsberghe A.. From FLOPs to footprints: the resource cost of artificial intelligence. 2025; arXiv preprint arXiv: 251204142."},{"issue":"1","key":"10.1016\/j.ress.2026.112484_bib0029","doi-asserted-by":"crossref","first-page":"1284","DOI":"10.1038\/s41467-021-21531-7","article-title":"Flexible thermal interface based on self-assembled boron arsenide for high-performance thermal management","volume":"12","author":"Cui","year":"2021","journal-title":"Nat Commun"},{"key":"10.1016\/j.ress.2026.112484_bib0030","doi-asserted-by":"crossref","DOI":"10.1016\/j.mssp.2024.108745","article-title":"Brief overview of the impact of thermal stress on the reliability of through silicon via: analysis, characterization, and enhancement","volume":"183","author":"Tang","year":"2024","journal-title":"Mater Sci Semicond Process"},{"key":"10.1016\/j.ress.2026.112484_bib0031","series-title":"Proceedings of the 58th IEEE\/ACM international symposium on microarchitecture\u00ae","first-page":"626","article-title":"Characterizing the efficiency of distributed training: a power, performance, and thermal perspective","author":"Go","year":"2025"}],"container-title":["Reliability Engineering &amp; System Safety"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0951832026003005?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0951832026003005?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T01:59:03Z","timestamp":1776131943000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0951832026003005"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":31,"alternative-id":["S0951832026003005"],"URL":"https:\/\/doi.org\/10.1016\/j.ress.2026.112484","relation":{},"ISSN":["0951-8320"],"issn-type":[{"value":"0951-8320","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Service reliability of intelligent computing cluster systems for large language models","name":"articletitle","label":"Article Title"},{"value":"Reliability Engineering & System Safety","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.ress.2026.112484","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"112484"}}