{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:50:49Z","timestamp":1771951849241,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640399","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"1095-1111","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Slapo: A Schedule Language for Progressive Optimization of Large Deep Learning Model Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6617-0075","authenticated-orcid":false,"given":"Hongzheng","family":"Chen","sequence":"first","affiliation":[{"name":"Cornell University, Ithaca, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9298-6254","authenticated-orcid":false,"given":"Cody Hao","family":"Yu","sequence":"additional","affiliation":[{"name":"Boson AI, Inc, Santa Clara, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3093-6486","authenticated-orcid":false,"given":"Shuai","family":"Zheng","sequence":"additional","affiliation":[{"name":"Boson AI, Inc, Santa Clara, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0164-0849","authenticated-orcid":false,"given":"Zhen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Amazon Web Services, Santa Clara, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0778-0308","authenticated-orcid":false,"given":"Zhiru","family":"Zhang","sequence":"additional","affiliation":[{"name":"Cornell University, Ithaca, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8165-840X","authenticated-orcid":false,"given":"Yida","family":"Wang","sequence":"additional","affiliation":[{"name":"Amazon Web Services, Santa Clara, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. Tensor-flow: A system for large-scale machine learning. In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2016."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"e_1_3_2_1_3_1","volume-title":"On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258","author":"Bommasani Rishi","year":"2021","unstructured":"Rishi Bommasani, Drew A. Hudson, Ehsan Adeli, Russ Altman, Simran Arora, Sydney von Arx, Michael S. Bernstein, Jeannette Bohg, Antoine Bosselut, Emma Brunskill, et al. 
On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258, 2021."},{"key":"e_1_3_2_1_4_1","unstructured":"James Bradbury Roy Frostig Peter Hawkins Matthew James Johnson Chris Leary Dougal Maclaurin George Necula Adam Paszke Jake VanderPlas Skye Wanderman-Milne and Qiao Zhang. JAX: composable transformations of Python+NumPy programs. http:\/\/github.com\/google\/jax 2018."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS), 2020."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476159"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. TVM: An automated end-to-end optimizing compiler for deep learning. In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2018."},{"key":"e_1_3_2_1_8_1","volume-title":"Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174, 2016."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Lianmin Zheng, Eddie Yan, Ziheng Jiang, Thierry Moreau, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. Learning to optimize tensor programs. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS), 2018."},{"key":"e_1_3_2_1_10_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311, 2022."},{"key":"e_1_3_2_1_11_1","volume-title":"FlashAttention: Fast and memory-efficient exact attention with io-awareness. arXiv preprint arXiv:2205.14135","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel Y Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. FlashAttention: Fast and memory-efficient exact attention with io-awareness. arXiv preprint arXiv:2205.14135, 2022."},{"key":"e_1_3_2_1_12_1","volume-title":"BERT: Pre-training of deep bidirectional Transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of deep bidirectional Transformers for language understanding. 
arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. An image is worth 16\u00d716 words: Transformers for image recognition at scale. In International Conference on Learning Representations (ICLR), 2021."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/313651.313830"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Gandhi Swapnil","year":"2021","unstructured":"Swapnil Gandhi and Anand Padmanabha Iyer. P3: Distributed deep graph learning at scale. In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2021."},{"key":"e_1_3_2_1_17_1","volume-title":"Distributed training with tensorflow. https:\/\/www.tensorflow.org\/guide\/distributed_training","year":"2022","unstructured":"Google. Distributed training with tensorflow. https:\/\/www.tensorflow.org\/guide\/distributed_training, 2022."},{"key":"e_1_3_2_1_18_1","volume-title":"XLA: Optimizing compiler for machine learning. https:\/\/www.tensorflow.org\/xla","year":"2022","unstructured":"Google. XLA: Optimizing compiler for machine learning. https:\/\/www.tensorflow.org\/xla, 2022."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414632"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. GPipe: Efficient training of giant neural networks using pipeline parallelism. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS), 2019."},{"key":"e_1_3_2_1_21_1","volume-title":"Wikipedia-en dataset. https:\/\/huggingface.co\/datasets\/wikipedia","year":"2022","unstructured":"HuggingFace. Wikipedia-en dataset. https:\/\/huggingface.co\/datasets\/wikipedia, 2022."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","author":"Jain Paras","year":"2020","unstructured":"Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Joseph Gonzalez, Kurt Keutzer, and Ion Stoica. Checkmate: Breaking the memory wall with optimal tensor rematerialization. In Proceedings of Machine Learning and Systems (MLSys), 2020."},{"key":"e_1_3_2_1_23_1","volume-title":"Pavel Belevich. Pippy: Pipeline parallelism for pytorch. https:\/\/github.com\/pytorch\/PiPPy","author":"James Reed Ke Wen","year":"2022","unstructured":"Ke Wen James Reed, Pavel Belevich. Pippy: Pipeline parallelism for pytorch. https:\/\/github.com\/pytorch\/PiPPy, 2022."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Jiang Yimin","year":"2020","unstructured":"Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. 
A unified architecture for accelerating distributed dnn training in heterogeneous gpu\/cpu clusters. In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2020."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575747"},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Kirisame Marisa","year":"2021","unstructured":"Marisa Kirisame, Steven Lyubomirsky, Altan Haan, Jennifer Brennan, Mike He, Jared Roesch, Tianqi Chen, and Zachary Tatlock. Dynamic tensor rematerialization. In International Conference on Learning Representations (ICLR), 2021."},{"key":"e_1_3_2_1_27_1","volume-title":"Reducing activation recomputation in large Transformer models. arXiv preprint arXiv:2205.05198","author":"Korthikanti Vijay","year":"2022","unstructured":"Vijay Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. Reducing activation recomputation in large Transformer models. arXiv preprint arXiv:2205.05198, 2022."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA)","author":"Lai Yi-Hsiang","year":"2019","unstructured":"Yi-Hsiang Lai, Yuze Chi, Yuwei Hu, Jie Wang, Cody Hao Yu, Yuan Zhou, Jason Cong, and Zhiru Zhang. HeteroCL: A multi-paradigm programming infrastructure for software-defined reconfigurable computing. In Proceedings of the ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA), 2019."},{"key":"e_1_3_2_1_29_1","volume-title":"xFormers: A modular and hackable Transformer modelling library. https:\/\/github.com\/facebookresearch\/xformers","author":"Lefaudeux Benjamin","year":"2022","unstructured":"Benjamin Lefaudeux, Francisco Massa, Diana Liskovich, Wenhan Xiong, Vittorio Caggiano, Sean Naren, Min Xu, Jieru Hu, Marta Tintore, Susan Zhang, Patrick Labatut, and Daniel Haziza. xFormers: A modular and hackable Transformer modelling library. https:\/\/github.com\/facebookresearch\/xformers, 2022."},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. GShard: Scaling giant models with conditional computation and automatic sharding. In International Conference on Learning Representations (ICLR), 2021."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"Liu Tianfeng","year":"2023","unstructured":"Tianfeng Liu, Yangrui Chen, Dan Li, Chuan Wu, Yibo Zhu, Jun He, Yanghua Peng, Hongzheng Chen, Hongzhi Chen, and Chuanxiong Guo. BGL: GPU-Efficient GNN training by optimizing graph data I\/O and preprocessing. In Proceedings of the USENIX Symposium on Networked Systems Design and Implementation (NSDI), 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"RoBERTa: A robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. RoBERTa: A robustly optimized BERT pretraining approach. 
arXiv preprint arXiv:1907.11692, 2019."},{"key":"e_1_3_2_1_34_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Liu Ze","year":"2022","unstructured":"Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, and Baining Guo. Swin Transformer V2: Scaling up capacity and resolution. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2022."},{"key":"e_1_3_2_1_35_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101, 2017."},{"key":"e_1_3_2_1_36_1","volume-title":"Torchvision: Pytorch's computer vision library. https:\/\/github.com\/pytorch\/vision","author":"TorchVision","year":"2016","unstructured":"TorchVision maintainers and contributors. Torchvision: Pytorch's computer vision library. https:\/\/github.com\/pytorch\/vision, 2016."},{"key":"e_1_3_2_1_37_1","volume-title":"Digit. Tech. J.","author":"McKeeman William M.","year":"1998","unstructured":"William M. McKeeman. Differential testing for software. Digit. Tech. J., 1998."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_40_1","series-title":"SIAM Journal on Optimization","volume-title":"Efficiency of coordinate descent methods on huge-scale optimization problems","author":"Nesterov Yu.","year":"2012","unstructured":"Yu. Nesterov. Efficiency of coordinate descent methods on huge-scale optimization problems. SIAM Journal on Optimization, 2012."},{"key":"e_1_3_2_1_41_1","volume-title":"Apex: Tools for easy mixed precision and distributed training in pytorch. https:\/\/github.com\/NVIDIA\/apex","year":"2022","unstructured":"Nvidia. Apex: Tools for easy mixed precision and distributed training in pytorch. https:\/\/github.com\/NVIDIA\/apex, 2022."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. PyTorch: An imperative style, high-performance deep learning library. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS), 2019."},{"key":"e_1_3_2_1_43_1","volume-title":"https:\/\/pytorch.org\/get-started\/pytorch-2.0\/","author":"PyTorch","year":"2022","unstructured":"PyTorch. PyTorch 2.0. https:\/\/pytorch.org\/get-started\/pytorch-2.0\/, 2022."},{"key":"e_1_3_2_1_44_1","volume-title":"Torchdynamo overview. https:\/\/pytorch.org\/docs\/master\/dynamo\/","year":"2022","unstructured":"PyTorch. Torchdynamo overview. https:\/\/pytorch.org\/docs\/master\/dynamo\/, 2022."},{"key":"e_1_3_2_1_45_1","volume-title":"https:\/\/dev-discuss.pytorch.org\/t\/747","year":"2022","unstructured":"PyTorch. TorchInductor. https:\/\/dev-discuss.pytorch.org\/t\/747, 2022."},{"key":"e_1_3_2_1_46_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/jit.html","year":"2022","unstructured":"PyTorch. 
TorchScript. https:\/\/pytorch.org\/docs\/stable\/jit.html, 2022."},{"issue":"8","key":"e_1_3_2_1_47_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019.","journal-title":"OpenAI blog"},{"issue":"140","key":"e_1_3_2_1_48_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the limits of transfer learning with a unified text-to-text Transformer. Journal of Machine Learning Research, 21(140):1--67, 2020.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","author":"Reed James","year":"2022","unstructured":"James Reed, Zachary DeVito, Horace He, Ansley Ussery, and Jason Ansel. torch.fx: Practical program capture and transformation for deep learning in python. Proceedings of Machine Learning and Systems (MLSys), 2022."},{"key":"e_1_3_2_1_53_1","volume-title":"Getting started - accelerate your scripts with nvfuser. https:\/\/github.com\/pytorch\/tutorials\/blob\/0d8c59f\/intermediate_source\/nvfuser_intro_tutorial.py","author":"Sarofeen Christian","year":"2022","unstructured":"Christian Sarofeen, Piotr Bialecki, Kevin Stephano, Jie Jiang, Masaki Kozuki, and Neal Vaidya. Getting started - accelerate your scripts with nvfuser. https:\/\/github.com\/pytorch\/tutorials\/blob\/0d8c59f\/intermediate_source\/nvfuser_intro_tutorial.py, 2022."},{"key":"e_1_3_2_1_54_1","volume-title":"Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799, 2018."},{"key":"e_1_3_2_1_55_1","volume-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-LM: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_56_1","volume-title":"Jie Young Sohn, and Denys Shabalin. LazyTensor: combining eager execution with domain-specific compilers. arXiv preprint arXiv:2102.13267","author":"Suhan Alex","year":"2021","unstructured":"Alex Suhan, Davide Libenzi, Ailing Zhang, Parker Schuh, Brennan Saeta, Jie Young Sohn, and Denys Shabalin. LazyTensor: combining eager execution with domain-specific compilers. arXiv preprint arXiv:2102.13267, 2021."},{"key":"e_1_3_2_1_57_1","volume-title":"Accelerating pytorch with cuda graphs). 
https:\/\/pytorch.org\/blog\/accelerating-pytorch-with-cuda-graphs\/","author":"Team PyTorch","year":"2021","unstructured":"PyTorch Team. Accelerating pytorch with cuda graphs). https:\/\/pytorch.org\/blog\/accelerating-pytorch-with-cuda-graphs\/, 2021."},{"key":"e_1_3_2_1_58_1","volume-title":"LLaMA: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. LLaMA: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_59_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. LLaMA 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 2023."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. Unity: Accelerating DNN training through joint optimization of algebraic transformations and parallelization. In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2022."},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS), 2017."},{"key":"e_1_3_2_1_62_1","volume-title":"Huggingface's Transformers: State-of-the-art natural language processing. arXiv preprint arXiv:1910.03771","author":"Wolf Thomas","year":"2019","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, et al. Huggingface's Transformers: State-of-the-art natural language processing. 
arXiv preprint arXiv:1910.03771, 2019."},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA)","author":"Xiang Shaojie","year":"2022","unstructured":"Shaojie Xiang, Yi-Hsiang Lai, Yuan Zhou, Hongzheng Chen, Niansong Zhang, Debjit Pal, and Zhiru Zhang. HeteroFlow: An accelerator programming model with decoupled data placement for software-defined fpgas. In Proceedings of the ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA), 2022."},{"key":"e_1_3_2_1_64_1","volume-title":"GSPMD: General and scalable parallelization for ml computation graphs. arXiv preprint arXiv:2105.04663","author":"Xu Yuanzhong","year":"2021","unstructured":"Yuanzhong Xu, HyoukJoong Lee, Dehao Chen, Blake Hechtman, Yanping Huang, Rahul Joshi, Maxim Krikun, Dmitry Lepikhin, Andy Ly, Marcello Maggioni, Ruoming Pang, Noam Shazeer, Shibo Wang, Tao Wang, Yonghui Wu, and Zhifeng Chen. GSPMD: General and scalable parallelization for ml computation graphs. arXiv preprint arXiv:2105.04663, 2021."},{"key":"e_1_3_2_1_65_1","volume-title":"RAF: Holistic compilation for deep learning model training. arXiv preprint arXiv:2303.04759","author":"Yu Cody Hao","year":"2023","unstructured":"Cody Hao Yu, Haozheng Fan, Guangtai Huang, Zhen Jia, Yizhi Liu, Jie Wang, Zach Zheng, Yuan Zhou, Haichen Shen, Junru Shao, Mu Li, and Yida Wang. RAF: Holistic compilation for deep learning model training. arXiv preprint arXiv:2303.04759, 2023."},{"key":"e_1_3_2_1_66_1","volume-title":"Wide residual networks. arXiv preprint arXiv:1605.07146","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. Wide residual networks. arXiv preprint arXiv:1605.07146, 2016."},{"key":"e_1_3_2_1_67_1","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi Victoria Lin Todor Mihaylov Myle Ott Sam Shleifer Kurt Shuster Daniel Simig Punit Singh Koura Anjali Sridhar Tianlu Wang and Luke Zettlemoyer. OPT: Open pre-trained Transformer language models. arXiv preprint arXiv:2205.01068 2022."},{"key":"e_1_3_2_1_68_1","volume-title":"Proc. VLDB Endow.","author":"Zhang Zhen","year":"2022","unstructured":"Zhen Zhang, Shuai Zheng, Yida Wang, Justin Chiu, George Karypis, Trishul Chilimbi, Mu Li, and Xin Jin. MiCS: Near-linear scaling for training gigantic model on public cloud. Proc. VLDB Endow., 2022."},{"key":"e_1_3_2_1_69_1","volume-title":"Proc. VLDB Endow.","author":"Zhao Yanli","year":"2023","unstructured":"Yanli Zhao, Andrew Gu, Rohan Varma, Liang Luo, Chien-Chin Huang, Min Xu, Less Wright, Hamid Shojanazeri, Myle Ott, Sam Shleifer, Alban Desmaison, Can Balioglu, Pritam Damania, Bernard Nguyen, Geeta Chauhan, Yuchen Hao, Ajit Mathews, and Shen Li. Pytorch FSDP: Experiences on scaling fully sharded data parallel. Proc. VLDB Endow., 2023."},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, et al. Ansor: Generating high-performance tensor programs for deep learning. 
In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2020."},{"key":"e_1_3_2_1_71_1","volume-title":"Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. Alpa: Automating inter- and Intra-Operator parallelism for distributed deep learning. In Proceedings of the USENIX Conference on Operating Systems Design and Implementation (OSDI), 2022."}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640399","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640399","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:42Z","timestamp":1750291422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640399"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":71,"alternative-id":["10.1145\/3620665.3640399","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640399","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
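
A record in this shape can be fetched and reused directly from the public Crossref REST API, which serves exactly this "message" envelope at https://api.crossref.org/works/{DOI}. The following is a minimal Python sketch using only the standard library: the DOI and the field names ("title", "author", "container-title", "page", "reference-count", "DOI") are taken from the record above, the User-Agent contact address is a placeholder you would replace with your own (per Crossref's "polite" usage convention), and the sketch assumes every author entry carries both "given" and "family", as all six do here.

import json
import urllib.request

DOI = "10.1145/3620665.3640399"  # the work shown above
URL = f"https://api.crossref.org/works/{DOI}"

# Identify the client per Crossref etiquette; the mailto is a placeholder.
req = urllib.request.Request(
    URL, headers={"User-Agent": "metadata-check/0.1 (mailto:you@example.com)"}
)
with urllib.request.urlopen(req) as resp:
    work = json.load(resp)["message"]  # mirrors the "message" object above

# Format a one-line citation from the structured fields.
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])
print(work["title"][0])
print(authors)
print(f'{work["container-title"][0]}, pp. {work["page"]}')
print(f'https://doi.org/{work["DOI"]} ({work["reference-count"]} references)')

Run against the DOI above, this should print the Slapo title, the six authors, the ASPLOS '24 proceedings name with pages 1095-1111, and the 71-entry reference count, matching the record as deposited.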