{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T06:05:15Z","timestamp":1779861915889,"version":"3.53.1"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,4,26]]},"DOI":"10.1109\/ispass69572.2026.00050","type":"proceedings-article","created":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T19:39:19Z","timestamp":1779824359000},"page":"448-460","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Cross-Architecture Performance Modeling of Distributed ML Workloads Using StableHLO"],"prefix":"10.1109","author":[{"given":"Jonas","family":"Svedas","sequence":"first","affiliation":[{"name":"20 Station Road, Cambridge CB1 2JD,imec,UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nathan","family":"Laubeuf","sequence":"additional","affiliation":[{"name":"Kapeldreef 75, 3001,imec,Leuven,Belgium"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ryan","family":"Harvey","sequence":"additional","affiliation":[{"name":"20 Station Road, Cambridge CB1 2JD,imec,UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Arjun","family":"Singh","sequence":"additional","affiliation":[{"name":"20 Station Road, Cambridge CB1 2JD,imec,UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Changhai","family":"Man","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology,Atlanta,GA,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abubakr","family":"Nada","sequence":"additional","affiliation":[{"name":"20 Station Road, Cambridge CB1 2JD,imec,UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tushar","family":"Krishna","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology,Atlanta,GA,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"James","family":"Myers","sequence":"additional","affiliation":[{"name":"20 Station Road, Cambridge CB1 2JD,imec,UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Debjyoti","family":"Bhattacharjee","sequence":"additional","affiliation":[{"name":"Kapeldreef 75, 3001,imec,Leuven,Belgium"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Training compute of frontier ai models grows by 4-5x}per year","author":"Sevilla","year":"2024"},{"key":"ref2","article-title":"The new llm bottleneck: A systems perspective on latent attention and mixture-of-experts","author":"Yun","year":"2025","journal-title":"arXiv preprint arXiv:2507.15465"},{"key":"ref3","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017","journal-title":"arXiv preprint arXiv:1701.06538"},{"key":"ref4","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","volume-title":"First conference on language modeling","author":"Gu"},{"key":"ref5","first-page":"1587","article-title":"A survey of state of the art large vision language models: Benchmark evaluations and challenges","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"Li"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"ref8","article-title":"cudnn: Efficient primitives for deep learning","author":"Chetlur","year":"2014","journal-title":"arXiv preprint arXiv:1410.0759"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.51130\/graphicon-2020-2-2-2"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CGO57630.2024.10444871"},{"key":"ref11","article-title":"Xla: Compiling machine learning for peak performance","author":"Sabne","year":"2020"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"ref13","article-title":"Nvidia blackwell architecture technical overview","volume-title":"NVIDIA Corporation, Tech. Rep.","year":"2024"},{"key":"ref14","article-title":"Introducing amd CDNA 3 architecture","volume":"20","year":"2025","journal-title":"Advanced Micro Devices, Inc., White Paper 2258402-A"},{"key":"ref15","article-title":"Ironwood: The first Google TPU for the age of inference","year":"2025"},{"key":"ref16","article-title":"Stablehlo specification","year":"2023"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00037"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607102"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC63097.2024.00015"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3587135.3592200"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3635867"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00035"},{"key":"ref24","first-page":"541","article-title":": Unifying architecture design and performance tuning for Large-Scale large language model training with scalability and precision","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Wang"},{"key":"ref25","first-page":"473","article-title":"Accelerating design space exploration for LLM training systems with multi-experiment parallel simulation","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Gui"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00021"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3712285.3759838"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3443255"},{"key":"ref29","first-page":"337","article-title":"Daydream: Accurately estimating the efficacy of optimizations for DNN training","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Zhu"},{"key":"ref30","article-title":"Echo: Simulating distributed training at scale","author":"Feng","year":"2024","journal-title":"arXiv preprint arXiv:2412.12487"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00082"},{"key":"ref32","article-title":"Chakra: Advancing performance benchmarking and co-design using standardized execution traces","author":"Sridharan","year":"2023","journal-title":"arXiv preprint arXiv:2305.14516"},{"key":"ref33","article-title":"Llmservingsim: A hw\/sw co-simulation infrastructure for 11 m inference serving at scale","author":"Cho","year":"2024","journal-title":"arXiv preprint arXiv:2408.05499"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2009.70"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458829"},{"key":"ref36","first-page":"638","article-title":"torch. fx: Practical program capture and transformation for deep learning in python","volume-title":"Proceedings of Machine Learning and Systems","volume":"4","author":"Reed"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707265"},{"key":"ref38","article-title":"Onnx: Open neural network exchange","author":"Bai","year":"2019"},{"key":"ref39","article-title":"Onnxim: A fast, cycle-level multi-core npu simulator","author":"Ham","year":"2024","journal-title":"arXiv preprint arXiv:2406.08051"},{"key":"ref40","article-title":"Zigzag: A memory-centric rapid dnn accelerator design space exploration framework","author":"Mei","year":"2020","journal-title":"arXiv preprint arXiv:2007.11360"},{"key":"ref41","article-title":"Xla architecture and high-level optimizer (hlo)","author":"Contributors","year":"2024"},{"key":"ref42","article-title":"Stablehlo: A portable, stable, and versioned ir for ml workloads","author":"Community","year":"2023"},{"key":"ref43","article-title":"\u2019sdy\u2019 dialect - openxla project","year":"2025"},{"key":"ref44","article-title":"Mlir: A compiler infrastructure for the end of moore\u2019s law","volume-title":"Proceedings of the IEEE","author":"Lattner"},{"key":"ref45","first-page":"2025","article-title":"Maxtext: A simple, performant 11 m library in jax"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/1964218.1964225"},{"key":"ref47","article-title":"Jax: composable transformations of python+numpy programs","year":"2018"},{"key":"ref48","article-title":"Flax: A neural network library and ecosystem for JAX","author":"Heek","year":"2024"},{"key":"ref49","article-title":"xprof: A profiling and performance analysis tool for machine learning","author":"contributors","journal-title":"OpenXLA, gitHub repository, Apache-2.0 license."},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2122762119"},{"key":"ref51","article-title":"The current and future of roofline","author":"Yang","year":"2019"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS64960.2025.00025"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3360307"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/1394608.1382129"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref56","article-title":"Scale-sim v3: A modular cycle-accurate systolic accelerator simulator for end-to-end system analysis","volume":"20","author":"Raj","journal-title":"arXiv preprint arXiv:2504.15377"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3059962"},{"key":"ref58","article-title":"Iree: An mlirbased compiler and runtime for machine learning models","author":"Authors","year":"2019"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3725843.3756045"}],"event":{"name":"2026 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)","location":"Seoul, Korea, Republic of","start":{"date-parts":[[2026,4,26]]},"end":{"date-parts":[[2026,4,28]]}},"container-title":["2026 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11527204\/11527232\/11527299.pdf?arnumber=11527299","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T05:36:46Z","timestamp":1779860206000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11527299\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/ispass69572.2026.00050","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]}}}