{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T13:59:34Z","timestamp":1762869574432,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,4]]},"DOI":"10.1145\/3757348.3757366","type":"proceedings-article","created":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T10:07:33Z","timestamp":1762855653000},"page":"166-177","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Evolving HPC services to enable ML workloads on HPE Cray EX"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-7977-8637","authenticated-orcid":false,"given":"Stefano","family":"Schuppli","sequence":"first","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8907-4427","authenticated-orcid":false,"given":"Fawzi","family":"Mohamed","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0250-3571","authenticated-orcid":false,"given":"Henrique","family":"Mendonca","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7952-8439","authenticated-orcid":false,"given":"Nina","family":"Mujkanovic","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3997-8395","authenticated-orcid":false,"given":"Elia","family":"Palme","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5322-5796","authenticated-orcid":false,"given":"Dino","family":"Conciatore","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6229-1556","authenticated-orcid":false,"given":"Lukas","family":"Drescher","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9363-7060","authenticated-orcid":false,"given":"Miguel","family":"Gila","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6091-6825","authenticated-orcid":false,"given":"Pim","family":"Witlox","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0902-5111","authenticated-orcid":false,"given":"Joost","family":"VandeVondele","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1849-1621","authenticated-orcid":false,"given":"Maxime","family":"Martinasso","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1086-5812","authenticated-orcid":false,"given":"Thomas C.","family":"Schulthess","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1333-9797","authenticated-orcid":false,"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[{"name":"Swiss National Supercomputing Centre, Lugano, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2025,11,11]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"Sadaf\u00a0R Alam Miguel Gila Mark Klein Maxime Martinasso and Thomas\u00a0C Schulthess. 2023. Versatile software-defined HPC and cloud clusters on Alps supercomputer for diverse workflows. The International Journal of High Performance Computing Applications 37 3-4 (2023) 288\u2013305. 10.1177\/10943420231167811 arXiv:10.1177\/10943420231167811","DOI":"10.1177\/10943420231167811"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","unstructured":"Mar\u00eda Arsuaga-R\u00edos Seppo\u00a0S Heikkil\u00e4 Dirk Duellmann Ren\u00e9 Meusel Jakob Blomer and Ben Couturier. 2015. Using S3 cloud storage with ROOT and CvmFS. Journal of Physics: Conference Series 664 2 (12 2015) 022001. 10.1088\/1742-6596\/664\/2\/022001","DOI":"10.1088\/1742-6596\/664\/2\/022001"},{"key":"e_1_3_3_2_4_2","unstructured":"Stas Bekman. 2022. The BLOOM training chronicles. https:\/\/github.com\/bigscience-workshop\/bigscience\/blob\/master\/train\/tr11-176B-ml\/chronicles.md"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-34356-9_5"},{"key":"e_1_3_3_2_6_2","unstructured":"Massimo Benini Jeff Hanson Mathilde Gianolli and et. al.2024. EMOI: CSCS Extensible Monitoring and Observability Infrastructure. https:\/\/www.research-collection.ethz.ch\/handle\/20.500.11850\/702519"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/INDIS54524.2021.00011"},{"key":"e_1_3_3_2_8_2","unstructured":"Arthur Chiao. 2023. Understanding GPU Performance: Utilization vs. Saturation. http:\/\/arthurchiao.art\/blog\/understanding-gpu-performance"},{"key":"e_1_3_3_2_9_2","unstructured":"Jonathan Coles Benjamin Cumming Theofilos-Ioannis Manitaras Piccinali Jean-Guillaume Simon Pintarelli and Harmen Stoppels. 2023. Deploying Alternative User Environments on Alps. https:\/\/cug.org\/proceedings\/cug2023_proceedings\/includes\/files\/pap143s2-file1.pdf"},{"key":"e_1_3_3_2_10_2","unstructured":"Container Device Interface. 2020. Container Device Interface home page. Retrieved Accessed: 2025-04-10 from https:\/\/github.com\/cncf-tags\/container-device-interface"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/SuperCompCloud51944.2020.00009"},{"key":"e_1_3_3_2_12_2","unstructured":"Felipe\u00a0A. Cruz and Alberto Madonna. 2024. Containers-first user environments on HPE Cray EX."},{"key":"e_1_3_3_2_13_2","unstructured":"Marcel Ferrari and Nina Mujkanovic. 2025. Practical Performance Modeling for Large-Scale Distributed GPU Workloads. https:\/\/www.hpcadvisorycouncil.com\/events\/2025\/swiss-conference\/agenda.php. Accessed: 2025-04-10."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807623"},{"key":"e_1_3_3_2_15_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri and et. al.2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_2_16_2","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong and et. al.2024. MegaScale: Scaling Large Language Model Training to More Than 10 000 GPUs. arxiv:https:\/\/arXiv.org\/abs\/2402.15627\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2402.15627"},{"key":"e_1_3_3_2_17_2","unstructured":"Apostolos Kokolis Michael Kuchnik John Hoffman Adithya Kumar Parth Malani Faye Ma Zachary DeVito Shubho Sengupta Kalyan Saladi and Carole-Jean Wu. 2024. Revisiting Reliability in Large-Scale Machine Learning Research Clusters. arxiv:https:\/\/arXiv.org\/abs\/2410.21680\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2410.21680"},{"key":"e_1_3_3_2_18_2","unstructured":"Glenn\u00a0K. Lockwood. 2025. LLM training without a parallel file system. https:\/\/blog.glennklockwood.com\/2025\/02\/llm-training-without-parallel-file.html. Accessed: 2025-04-09."},{"key":"e_1_3_3_2_19_2","unstructured":"Phillip Lougher. 2014. SquashFS - A compressed read-only filesystem for Linux. https:\/\/github.com\/plougher\/squashfs-tools. Accessed: 2025-04-09."},{"key":"e_1_3_3_2_20_2","unstructured":"Ryan Lucchese Niki Birkner Yaron Hagai and Virginia Adams. 2024. A practitioner\u2019s guide to testing and running large GPU clusters for training generative AI models. https:\/\/www.together.ai\/blog\/a-practitioners-guide-to-testing-and-running-large-gpu-clusters-for-training-generative-ai-models"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Maxime Martinasso Mark Klein Benjamin Cumming Miguel Gila Felipe\u00a0A. Cruz Alberto Madonna Manuel\u00a0Sopena Ballesteros Sadaf\u00a0R. Alam and Thomas\u00a0C. Schulthess. 2024. Versatile Software-Defined Cluster for HPC Using Cloud Abstractions. Comput. Sci. Eng. 26 3 (2024) 20\u201329. 10.1109\/MCSE.2024.3394164","DOI":"10.1109\/MCSE.2024.3394164"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3757348.3757365"},{"key":"e_1_3_3_2_23_2","unstructured":"Simon McIntosh-Smith. 2011. The GPU Computing Revolution. https:\/\/www.lms.ac.uk\/sites\/default\/files\/files\/reports\/GPU-KT-report-screen.pdf"},{"key":"e_1_3_3_2_24_2","unstructured":"Timothy Morgan. 2024. AMD now has more compute on the TOP500 than NVIDIA. https:\/\/www.nextplatform.com\/2024\/11\/18\/amd-now-has-more-compute-on-the-top500-than-nvidia\/"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00052"},{"key":"e_1_3_3_2_26_2","unstructured":"NVIDIA. 2018. DCGM Feature Overview. https:\/\/docs.nvidia.com\/datacenter\/dcgm\/latest\/user-guide\/feature-overview.html Accessed: 2025-04-10."},{"key":"e_1_3_3_2_27_2","unstructured":"Team OLMo Pete Walsh Luca Soldaini Dirk Groeneveld and et. al.2024. 2 OLMo 2 Furious. arxiv:https:\/\/arXiv.org\/abs\/2501.00656\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2501.00656"},{"key":"e_1_3_3_2_28_2","unstructured":"Open Container Initiative. 2022. Open Container Initiative home page. Retrieved Accessed: 2025-04-10 from https:\/\/www.opencontainers.org\/"},{"key":"e_1_3_3_2_29_2","unstructured":"Lucas Pasqualin Less Wright Iris Zhang Chien-Chin Huang Swaminathan Sundararaman Saransh Gupta and Raghu Ganti. 2024. Reducing Model Checkpointing Times by Over 10x with PyTorch Distributed Asynchronous Checkpointing. https:\/\/pytorch.org\/blog\/reducing-checkpointing-times"},{"key":"e_1_3_3_2_30_2","unstructured":"Anton Shilov. 2024. Faulty Nvidia H100 GPUs and HBM3 memory caused half of failures during LLama 3 training. https:\/\/www.tomshardware.com\/tech-industry\/artificial-intelligence\/faulty-nvidia-h100-gpus-and-hbm3-memory-caused-half-of-the-failures-during-llama-3-training-one-failure-every-three-hours-for-metas-16384-gpu-training-cluster"},{"key":"e_1_3_3_2_31_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac and et. al.2024. Gemini: A Family of Highly Capable Multimodal Models. arxiv:https:\/\/arXiv.org\/abs\/2312.11805\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2312.11805"},{"key":"e_1_3_3_2_32_2","unstructured":"Imbue Team. 2024. From bare metal to a 70B model: infrastructure set-up and scripts. https:\/\/imbue.com\/research\/70b-infrastructure"},{"key":"e_1_3_3_2_33_2","unstructured":"The\u00a0PyTorch team. 2024. (prototype) Flight Recorder for Debugging Stuck Jobs. https:\/\/pytorch.org\/tutorials\/prototype\/flight_recorder_tutorial.html."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218551"},{"key":"e_1_3_3_2_35_2","unstructured":"ETH Zurich and EPFL. 2023. Swiss AI Initiative. https:\/\/www.swiss-ai.org. Accessed: 2025-04-09."}],"event":{"name":"CUG 2025: Cray User Group","acronym":"CUG '25","location":"Jersey City USA"},"container-title":["Proceedings of the Cray User Group"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3757348.3757366","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T10:28:53Z","timestamp":1762856933000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757348.3757366"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,4]]},"references-count":34,"alternative-id":["10.1145\/3757348.3757366","10.1145\/3757348"],"URL":"https:\/\/doi.org\/10.1145\/3757348.3757366","relation":{},"subject":[],"published":{"date-parts":[[2025,5,4]]},"assertion":[{"value":"2025-11-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}