{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T10:35:52Z","timestamp":1762857352871,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,4]]},"DOI":"10.1145\/3757348.3757361","type":"proceedings-article","created":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T10:07:33Z","timestamp":1762855653000},"page":"115-126","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating AMD Instinct\u2122 MI300A APU: Performance Insights on LLM Training via Knowledge Distillation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1686-2479","authenticated-orcid":false,"given":"Dennis","family":"Dickmann","sequence":"first","affiliation":[{"name":"Seedbox, Stuttgart, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1674-7980","authenticated-orcid":false,"given":"Philipp","family":"Offenh\u00e4user","sequence":"additional","affiliation":[{"name":"HPE, B\u00f6blingen, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4931-2710","authenticated-orcid":false,"given":"Rishabh","family":"Saxena","sequence":"additional","affiliation":[{"name":"HLRS, University of Stuttgart, Stuttgart, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5571-4823","authenticated-orcid":false,"given":"George","family":"Markomanolis","sequence":"additional","affiliation":[{"name":"AMD, Lille, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2132-7726","authenticated-orcid":false,"given":"Alessandro","family":"Rigazzi","sequence":"additional","affiliation":[{"name":"HPE HPC\/AI EMEA Research Lab, Basel, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5660-750X","authenticated-orcid":false,"given":"Patrick","family":"Keller","sequence":"additional","affiliation":[{"name":"HPE, Schwalbach, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3333-4248","authenticated-orcid":false,"given":"Kerem","family":"Kayabay","sequence":"additional","affiliation":[{"name":"HLRS, University of Stuttgart, Stuttgart, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8976-8603","authenticated-orcid":false,"given":"Dennis","family":"Hoppe","sequence":"additional","affiliation":[{"name":"HLRS, University of Stuttgart, Stuttgart, Germany"}]}],"member":"320","published-online":{"date-parts":[[2025,11,11]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Loubna\u00a0Ben Allal Anton Lozhkov Elie Bakouch Gabriel\u00a0Mart\u00edn Bl\u00e1zquez Guilherme Penedo Lewis Tunstall Andr\u00e9s Marafioti Hynek Kydl\u00ed\u010dek Agust\u00edn\u00a0Piqueres Lajar\u00edn Vaibhav Srivastav et\u00a0al. 2025. SmolLM2: When Smol Goes Big\u2013Data-Centric Training of a Small Language Model."},{"key":"e_1_3_3_2_3_2","unstructured":"AMD ROCm Contributors. 2025. AWS OFI RCCL. https:\/\/github.com\/ROCm\/aws-ofi-rccl. Accessed: 2025-05-02."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"D.M. Anisuzzaman Jeffrey\u00a0G. Malins Paul\u00a0A. Friedman and Zachi\u00a0I. Attia. 2025. Fine-Tuning Large Language Models for Specialized Use Cases. Mayo Clinic Proceedings: Digital Health 3 1 (2025) 100184. 
10.1016\/j.mcpdig.2024.11.005","DOI":"10.1016\/j.mcpdig.2024.11.005"},{"key":"e_1_3_3_2_5_2","unstructured":"Loubna Ben\u00a0Allal Anton Lozhkov Guilherme Penedo Thomas Wolf and Leandro von Werra. 2024. SmolLM-Corpus. https:\/\/huggingface.co\/datasets\/HuggingFaceTB\/smollm-corpus"},{"key":"e_1_3_3_2_6_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/1150402.1150464"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528939"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00039"},{"key":"e_1_3_3_2_10_2","unstructured":"Epoch AI. 2023. Key Trends and Figures in Machine Learning. https:\/\/epoch.ai\/trends Accessed: 2025-03-28."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4613-0303-933"},{"key":"e_1_3_3_2_12_2","unstructured":"Yuxian Gu Li Dong Furu Wei and Minlie Huang. 2024. MiniLLM: Knowledge Distillation of Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2306.08543\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2306.08543"},{"key":"e_1_3_3_2_13_2","unstructured":"Tom Gunter Zirui Wang Chong Wang Ruoming Pang Andy Narayanan Aonan Zhang Bowen Zhang Chen Chen Chung-Cheng Chiu David Qiu et\u00a0al. 2024. Apple Intelligence Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21075\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21075"},{"key":"e_1_3_3_2_14_2","unstructured":"Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil Devanur Greg Ganger and Phil Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. arxiv:https:\/\/arXiv.org\/abs\/1806.03377\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/1806.03377"},{"key":"e_1_3_3_2_15_2","unstructured":"High-Performance Computing Center Stuttgart (HLRS). 2025. \u201cHunter\u201c Supercomputer Goes Into Service in Stuttgart. https:\/\/www.hlrs.de\/press\/detail\/hunter-supercomputer-goes-into-service-in-stuttgart Accessed: 28.03.2025."},{"key":"e_1_3_3_2_16_2","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. arxiv:https:\/\/arXiv.org\/abs\/1503.02531\u00a0[stat.ML] https:\/\/arxiv.org\/abs\/1503.02531"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"John Kim Wiliam\u00a0J Dally Steve Scott and Dennis Abts. 2008. Technology-driven highly-scalable dragonfly topology. ACM SIGARCH Computer Architecture News 36 3 (2008) 77\u201388.","DOI":"10.1145\/1394608.1382129"},{"key":"e_1_3_3_2_18_2","unstructured":"Eldar Kurtic Denis Kuznedelev Elias Frantar Michael Goin and Dan Alistarh. 2023. Sparse Fine-tuning for Inference Acceleration of Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2310.06927\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2310.06927"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_2_20_2","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. 
arxiv:https:\/\/arXiv.org\/abs\/2006.15704\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2006.15704"},{"key":"e_1_3_3_2_21_2","volume-title":"Advances in Neural Information Processing Systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et\u00a0al. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems , Vol.\u00a032. Curran Associates, Inc., New York, NY, USA, 12\u00a0pages."},{"key":"e_1_3_3_2_22_2","unstructured":"D. Patel D. Nishball and R. Knuhtsen. 2024. MI300X vs H100 vs H200 Benchmark Part 1: Training \u2013 CUDA Moat Still Alive. SemiAnalysis. https:\/\/semianalysis.com\/2024\/12\/22\/mi300x-vs-h100-vs-h200-benchmark-part-1-training\/ Accessed: Jan. 22 2025."},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_3_2_25_2","first-page":"551","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza\u00a0Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. { ZeRO-Offload} : Democratizing { Billion-Scale} Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, Berkeley, CA, US, 551\u2013564. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/ren-jie"},{"key":"e_1_3_3_2_26_2","volume-title":"Advances in Neural Information Processing Systems","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, Ryan Sepassi, and Blake Hechtman. 2018. Mesh-TensorFlow: Deep Learning for Supercomputers. In Advances in Neural Information Processing Systems , S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett (Eds.), Vol.\u00a031. Curran Associates, Inc., New York, NY, USA, 10\u00a0pages."},{"key":"e_1_3_3_2_27_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arxiv:https:\/\/arXiv.org\/abs\/1909.08053\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00068"},{"key":"e_1_3_3_2_29_2","unstructured":"Sharath\u00a0Turuvekere Sreenivas Saurav Muralidharan Raviraj Joshi Marcin Chochowski Ameya\u00a0Sunil Mahabaleshwarkar Gerald Shen Jiaqi Zeng Zijia Chen Yoshi Suhara Shizhe Diao et\u00a0al. 2024. LLM Pruning and Distillation in Practice: The Minitron Approach. arxiv:https:\/\/arXiv.org\/abs\/2408.11796\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2408.11796"},{"key":"e_1_3_3_2_30_2","unstructured":"Gemma Team Morgane Riviere Shreya Pathak Pier\u00a0Giuseppe Sessa Cassidy Hardin Surya Bhupatiraju L\u00e9onard Hussenot Thomas Mesnard Bobak Shahriari Alexandre Ram\u00e9 et\u00a0al. 2024. Gemma 2: Improving Open Language Models at a Practical Size. arxiv:https:\/\/arXiv.org\/abs\/2408.00118\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2408.00118"},{"key":"e_1_3_3_2_31_2","unstructured":"Mistral\u00a0AI Team. 2025. 
Mistral Small 3 | Mistral AI. https:\/\/mistral.ai\/news\/mistral-small-3"},{"key":"e_1_3_3_2_32_2","unstructured":"TOP500.org. 2024. TOP500 List - November 2024. TOP500. https:\/\/www.top500.org\/lists\/top500\/2024\/11\/ Accessed: Jan. 22 2025."},{"key":"e_1_3_3_2_33_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_2_34_2","unstructured":"Xiaohan Xu Ming Li Chongyang Tao Tao Shen Reynold Cheng Jinyang Li Can Xu Dacheng Tao and Tianyi Zhou. 2024. A Survey on Knowledge Distillation of Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2402.13116\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2402.13116"}],"event":{"name":"CUG 2025: Cray User Group","acronym":"CUG '25","location":"Jersey City USA"},"container-title":["Proceedings of the Cray User Group"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3757348.3757361","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T10:28:29Z","timestamp":1762856909000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757348.3757361"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,4]]},"references-count":33,"alternative-id":["10.1145\/3757348.3757361","10.1145\/3757348"],"URL":"https:\/\/doi.org\/10.1145\/3757348.3757361","relation":{},"subject":[],"published":{"date-parts":[[2025,5,4]]},"assertion":[{"value":"2025-11-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}