{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T13:04:51Z","timestamp":1780664691492,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2420977"],"award-info":[{"award-number":["CNS-2420977"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769366","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"1738-1758","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Maya: Optimizing Deep Learning Training Workloads using GPU Runtime Emulation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4020-3931","authenticated-orcid":false,"given":"Srihas","family":"Yarlagadda","sequence":"first","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2286-577X","authenticated-orcid":false,"given":"Amey","family":"Agrawal","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4570-5333","authenticated-orcid":false,"given":"Elton","family":"Pinto","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2878-0571","authenticated-orcid":false,"given":"Hakesh","family":"Darapaneni","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9381-479X","authenticated-orcid":false,"given":"Mitali","family":"Meratwal","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9173-026X","authenticated-orcid":false,"given":"Shivam","family":"Mittal","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0065-2522","authenticated-orcid":false,"given":"Pranavi","family":"Bajjuri","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0651-370X","authenticated-orcid":false,"given":"Srinivas","family":"Sridharan","sequence":"additional","affiliation":[{"name":"Nvidia Inc., Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7862-1477","authenticated-orcid":false,"given":"Alexey","family":"Tumanov","sequence":"additional","affiliation":[{"name":"School of Computer Science, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"366","volume-title":"Vidur: A large-scale simulation framework for llm inference","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Jayashree Mohan, Ashish Panwar, Nipun Kwatra, Bhargav S. Gulavani, Ramachandran Ramjee, and Alexey Tumanov. Vidur: A large-scale simulation framework for llm inference. In P. Gibbons, G. Pekhimenko, and C. De Sa, editors, Proceedings of Machine Learning and Systems, volume 6, pages 351\u2013366, 2024."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Gargi Alavani Kajal Varma and Santonu Sarkar. Predicting execution time of cuda kernel using static analysis. In 2018 IEEE Intl Conf on Parallel & Distributed Processing with Applications Ubiquitous Computing & Communications Big Data & Cloud Computing Social Computing & Networking Sustainable Computing & Communications (ISPA\/IUCC\/BDCloud\/SocialCom\/SustainCom) pages 948\u2013955. IEEE 2018.","DOI":"10.1109\/BDCloud.2018.00139"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_4_1","volume-title":"vtrain: A simulation framework for evaluating cost-effective and compute-optimal large language model training. arXiv preprint arXiv:2312.12391","author":"Bang Jehyeon","year":"2023","unstructured":"Jehyeon Bang, Yujeong Choi, Myeongwoo Kim, Yongdeok Kim, and Minsoo Rhu. vtrain: A simulation framework for evaluating cost-effective and compute-optimal large language model training. arXiv preprint arXiv:2312.12391, 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"A simple model for portable and fast prediction of execution time and power consumption of gpu kernels. ACM Transactions on Architecture and Code Optimization (TACO), 18(1):1\u201325","author":"Braun Lorenz","year":"2020","unstructured":"Lorenz Braun, Sotirios Nikas, Chen Song, Vincent Heuveline, and Holger Fr\u00f6ning. A simple model for portable and fast prediction of execution time and power consumption of gpu kernels. ACM Transactions on Architecture and Code Optimization (TACO), 18(1):1\u201325, 2020."},{"key":"e_1_3_2_1_6_1","volume-title":"Video generation models as world simulators","author":"Brooks Tim","year":"2024","unstructured":"Tim Brooks, Bill Peebles, Connor Holmes, Will DePue, Yufei Guo, Li Jing, David Schnurr, Joe Taylor, Troy Luhman, Eric Luhman, Clarence Ng, Ricky Wang, and Aditya Ramesh. Video generation models as world simulators. 2024."},{"key":"e_1_3_2_1_7_1","volume-title":"Language models are few-shot learners. arXiv preprint arXiv:2005.14165","author":"Brown Tom B","year":"2020","unstructured":"Tom B Brown. Language models are few-shot learners. arXiv preprint arXiv:2005.14165, 2020."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/github.com\/deepseek-ai\/DualPipe","year":"2025","unstructured":"DeepSeek. DualPipe. https:\/\/github.com\/deepseek-ai\/DualPipe, 2025."},{"key":"e_1_3_2_1_9_1","volume-title":"Deepseek-v3 technical report","author":"Aixin Liu AI","year":"2025","unstructured":"DeepSeek-AI, Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, Damai Dai, Daya Guo, Dejian Yang, Deli Chen, Dongjie Ji, Erhang Li, Fangyun Lin, Fucong Dai, Fuli Luo, Guangbo Hao, Guanting Chen, Guowei Li, H. Zhang, Han Bao, Hanwei Xu, Haocheng Wang, Haowei Zhang, Honghui Ding, Huajian Xin, Huazuo Gao, Hui Li, Hui Qu, J. L. Cai, Jian Liang, Jianzhong Guo, Jiaqi Ni, Jiashi Li, Jiawei Wang, Jin Chen, Jingchang Chen, Jingyang Yuan, Junjie Qiu, Junlong Li, Junxiao Song, Kai Dong, Kai Hu, Kaige Gao, Kang Guan, Kexin Huang, Kuai Yu, Lean Wang, Lecong Zhang, Lei Xu, Leyi Xia, Liang Zhao, Litong Wang, Liyue Zhang, Meng Li, Miaojun Wang, Mingchuan Zhang, Minghua Zhang, Minghui Tang, Mingming Li, Ning Tian, Panpan Huang, Peiyi Wang, Peng Zhang, Qiancheng Wang, Qihao Zhu, Qinyu Chen, Qiushi Du, R. J. Chen, R. L. Jin, Ruiqi Ge, Ruisong Zhang, Ruizhe Pan, Runji Wang, Runxin Xu, Ruoyu Zhang, Ruyi Chen, S. S. Li, Shanghao Lu, Shangyan Zhou, Shanhuang Chen, Shaoqing Wu, Shengfeng Ye, Shengfeng Ye, Shirong Ma, Shiyu Wang, Shuang Zhou, Shuiping Yu, Shunfeng Zhou, Shuting Pan, T. Wang, Tao Yun, Tian Pei, Tianyu Sun, W. L. Xiao, Wangding Zeng, Wanjia Zhao, Wei An, Wen Liu, Wenfeng Liang, Wenjun Gao, Wenqin Yu, Wentao Zhang, X. Q. Li, Xiangyue Jin, Xianzu Wang, Xiao Bi, Xiaodong Liu, Xiaohan Wang, Xiaojin Shen, Xiaokang Chen, Xiaokang Zhang, Xiaosha Chen, Xiaotao Nie, Xiaowen Sun, Xiaoxiang Wang, Xin Cheng, Xin Liu, Xin Xie, Xingchao Liu, Xingkai Yu, Xinnan Song, Xinxia Shan, Xinyi Zhou, Xinyu Yang, Xinyuan Li, Xuecheng Su, Xuheng Lin, Y. K. Li, Y. Q. Wang, Y. X. Wei, Y. X. Zhu, Yang Zhang, Yanhong Xu, Yanhong Xu, Yanping Huang, Yao Li, Yao Zhao, Yaofeng Sun, Yaohui Li, Yaohui Wang, Yi Yu, Yi Zheng, Yichao Zhang, Yifan Shi, Yiliang Xiong, Ying He, Ying Tang, Yishi Piao, Yisong Wang, Yixuan Tan, Yiyang Ma, Yiyuan Liu, Yongqiang Guo, Yu Wu, Yuan Ou, Yuchen Zhu, Yuduan Wang, Yue Gong, Yuheng Zou, Yujia He, Yukun Zha, Yunfan Xiong, Yunxian Ma, Yuting Yan, Yuxiang Luo, Yuxiang You, Yuxuan Liu, Yuyang Zhou, Z. F. Wu, Z. Z. Ren, Zehui Ren, Zhangli Sha, Zhe Fu, Zhean Xu, Zhen Huang, Zhen Zhang, Zhenda Xie, Zhengyan Zhang, Zhewen Hao, Zhibin Gou, Zhicheng Ma, Zhigang Yan, Zhihong Shao, Zhipeng Xu, Zhiyu Wu, Zhongyu Zhang, Zhuoshu Li, Zihui Gu, Zijia Zhu, Zijun Liu, Zilin Li, Ziwei Xie, Ziyang Song, Ziyi Gao, and Zizheng Pan. Deepseek-v3 technical report, 2025."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/github.com\/deepspeedai\/DeepSpeedExamples","year":"2020","unstructured":"DeepSpeed. DeepSpeedExamples. https:\/\/github.com\/deepspeedai\/DeepSpeedExamples, 2020."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/github.com\/JF-D\/Proteus","author":"Duan Jiangfei","year":"2023","unstructured":"Jiangfei Duan, Xiuhong Li, Ping Xu, Xingcheng Zhang, Shengen Yan, Yun Liang, and Dahua Lin. Proteus. https:\/\/github.com\/JF-D\/Proteus, 2023."},{"key":"e_1_3_2_1_12_1","volume-title":"Proteus: Simulating the performance of distributed dnn training. arXiv preprint arXiv:2306.02267","author":"Duan Jiangfei","year":"2023","unstructured":"Jiangfei Duan, Xiuhong Li, Ping Xu, Xingcheng Zhang, Shengen Yan, Yun Liang, and Dahua Lin. Proteus: Simulating the performance of distributed dnn training. arXiv preprint arXiv:2306.02267, 2023."},{"key":"e_1_3_2_1_13_1","volume-title":"The llama 3 herd of models. arXiv preprint arXiv:2407.21783","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783, 2024."},{"key":"e_1_3_2_1_14_1","first-page":"521","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Geoffrey X Yu","year":"2021","unstructured":"X Yu Geoffrey, Yubo Gao, Pavel Golikov, and Gennady Pekhimenko. Habitat: A {Runtime-Based} computational performance predictor for deep neural network training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21), pages 503\u2013521, 2021."},{"key":"e_1_3_2_1_15_1","first-page":"170","volume-title":"2018 30th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)","author":"Gianniti Eugenio","unstructured":"Eugenio Gianniti, Li Zhang, and Danilo Ardagna. Performance prediction of gpu-based deep learning applications. In 2018 30th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), pages 167\u2013170. IEEE, 2018."},{"key":"e_1_3_2_1_16_1","volume-title":"Accelerate: Training and inference at scale made simple, efficient and adaptable. https:\/\/github.com\/huggingface\/accelerate","author":"Gugger Sylvain","year":"2022","unstructured":"Sylvain Gugger, Lysandre Debut, Thomas Wolf, Philipp Schmid, Zachary Mueller, Sourab Mangrulkar, Marc Sun, and Benjamin Bossan. Accelerate: Training and inference at scale made simple, efficient and adaptable. https:\/\/github.com\/huggingface\/accelerate, 2022."},{"key":"e_1_3_2_1_17_1","volume-title":"The cma evolution strategy: A tutorial. arXiv preprint arXiv:1604.00772","author":"Hansen Nikolaus","year":"2016","unstructured":"Nikolaus Hansen. The cma evolution strategy: A tutorial. arXiv preprint arXiv:1604.00772, 2016."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/106365601750190398"},{"key":"e_1_3_2_1_19_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607102"},{"key":"e_1_3_2_1_21_1","volume-title":"Beyond data and model parallelism for deep neural networks. CoRR, abs\/1807.05358","author":"Jia Zhihao","year":"2018","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. Beyond data and model parallelism for deep neural networks. CoRR, abs\/1807.05358, 2018."},{"key":"e_1_3_2_1_22_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361, 2020."},{"key":"e_1_3_2_1_23_1","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume":"5","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. Reducing activation recomputation in large transformer models. Proceedings of Machine Learning and Systems, 5:341\u2013353, 2023.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707265"},{"key":"e_1_3_2_1_25_1","volume-title":"Tune: A research platform for distributed model selection and training. arXiv preprint arXiv:1807.05118","author":"Liaw Richard","year":"2018","unstructured":"Richard Liaw, Eric Liang, Robert Nishihara, Philipp Moritz, Joseph E Gonzalez, and Ion Stoica. Tune: A research platform for distributed model selection and training. arXiv preprint arXiv:1807.05118, 2018."},{"key":"e_1_3_2_1_26_1","volume-title":"Apex: An extensible and dynamism-aware simulator for automated parallel execution in llm serving","author":"Lin Yi-Chien","year":"2025","unstructured":"Yi-Chien Lin, Woosuk Kwon, Ronald Pineda, and Fanny Nina Paravecino. Apex: An extensible and dynamism-aware simulator for automated parallel execution in llm serving, 2025."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587135.3592200"},{"key":"e_1_3_2_1_28_1","volume-title":"DistIR: An Intermediate Representation for Optimizing Distributed Neural Networks. https:\/\/github.com\/microsoft\/dist-ir","year":"2023","unstructured":"Microsoft. DistIR: An Intermediate Representation for Optimizing Distributed Neural Networks. https:\/\/github.com\/microsoft\/dist-ir, 2023."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00037"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_31_1","volume-title":"Introduction to InfiniBand. Whitepaper WP-190","author":"NVIDIA.","year":"2021","unstructured":"NVIDIA. Introduction to InfiniBand. Whitepaper WP-190, NVIDIA Corporation, 2021."},{"key":"e_1_3_2_1_32_1","volume-title":"Nvidia h100 tensor core gpu architecture. https:\/\/resources.nvidia.com\/en-us-tensor-core\/gtc22-whitepaper-hopper","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. Nvidia h100 tensor core gpu architecture. https:\/\/resources.nvidia.com\/en-us-tensor-core\/gtc22-whitepaper-hopper, 2022."},{"key":"e_1_3_2_1_33_1","volume-title":"NVIDIA Tesla V100 GPU Architecture. Whitepaper","author":"NVIDIA Corporation","year":"2017","unstructured":"NVIDIA Corporation. NVIDIA Tesla V100 GPU Architecture. Whitepaper, NVIDIA Corporation, 2017."},{"key":"e_1_3_2_1_34_1","volume-title":"https:\/\/openai.com\/chatgpt","author":"AI.","year":"2023","unstructured":"OpenAI. ChatGPT. https:\/\/openai.com\/chatgpt, 2023. Accessed: September 27, 2025."},{"key":"e_1_3_2_1_35_1","volume-title":"pplx-kernels. https:\/\/github.com\/perplexityai\/pplx-kernels","year":"2025","unstructured":"Perplexity. pplx-kernels. https:\/\/github.com\/perplexityai\/pplx-kernels, 2025."},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Learning Representations","author":"Qi Hang","year":"2017","unstructured":"Hang Qi, Evan R Sparks, and Ameet Talwalkar. Paleo: A performance model for deep neural networks. In International Conference on Learning Representations, 2017."},{"key":"e_1_3_2_1_37_1","first-page":"16","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201316. IEEE, 2020."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00018"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458829"},{"key":"e_1_3_2_1_40_1","volume-title":"Memory optimization for deep networks. arXiv preprint arXiv:2010.14501","author":"Shah Aashaka","year":"2020","unstructured":"Aashaka Shah, Chao-Yuan Wu, Jayashree Mohan, Vijay Chidambaram, and Philipp Kr\u00e4henb\u00fchl. Memory optimization for deep networks. arXiv preprint arXiv:2010.14501, 2020."},{"key":"e_1_3_2_1_41_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_42_1","volume-title":"et al. Singularity: Planet-scale, preemptive and elastic scheduling of ai workloads. arXiv preprint arXiv:2202.07848","author":"Shukla Dharma","year":"2022","unstructured":"Dharma Shukla, Muthian Sivathanu, Srinidhi Viswanatha, Bhargav Gulavani, Rimma Nehme, Amey Agrawal, Chen Chen, Nipun Kwatra, Ramachandran Ramjee, Pankaj Sharma, et al. Singularity: Planet-scale, preemptive and elastic scheduling of ai workloads. arXiv preprint arXiv:2202.07848, 2022."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_2_1_44_1","volume-title":"Practical bayesian optimization of machine learning algorithms. Advances in neural information processing systems, 25","author":"Snoek Jasper","year":"2012","unstructured":"Jasper Snoek, Hugo Larochelle, and Ryan P Adams. Practical bayesian optimization of machine learning algorithms. Advances in neural information processing systems, 25, 2012."},{"key":"e_1_3_2_1_45_1","first-page":"284","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et al. Unity: Accelerating {DNN} training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 267\u2013284, 2022."},{"key":"e_1_3_2_1_46_1","volume-title":"Omniwise: Predicting gpu kernels performance with llms","author":"Wang Zixian","year":"2025","unstructured":"Zixian Wang, Cole Ramos, Muhammad A. Awad, and Keith Lowery. Omniwise: Predicting gpu kernels performance with llms, 2025."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00035"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783270"},{"key":"e_1_3_2_1_49_1","volume-title":"Deepep: an efficient expert-parallel communication library. https:\/\/github.com\/deepseekai\/DeepEP","author":"Zhao Chenggang","year":"2025","unstructured":"Chenggang Zhao, Shangyan Zhou, Liyue Zhang, Chengqi Deng, Zhean Xu, Yuxuan Liu, Kuai Yu, Jiashi Li, and Liang Zhao. Deepep: an efficient expert-parallel communication library. https:\/\/github.com\/deepseekai\/DeepEP, 2025."},{"key":"e_1_3_2_1_50_1","first-page":"578","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 559\u2013578, 2022."},{"key":"e_1_3_2_1_51_1","first-page":"352","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Zhu Hongyu","year":"2020","unstructured":"Hongyu Zhu, Amar Phanishayee, and Gennady Pekhimenko. Daydream: Accurately estimating the efficacy of optimizations for {DNN} training. In 2020 USENIX Annual Technical Conference (USENIX ATC 20), pages 337\u2013352, 2020."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3767295.3769366","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769366","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769366","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:17:21Z","timestamp":1780661841000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769366"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":51,"alternative-id":["10.1145\/3767295.3769366","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769366","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}