{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T22:50:31Z","timestamp":1757631031292,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":100,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,6]]},"DOI":"10.1145\/3676642.3736399","type":"proceedings-article","created":{"date-parts":[[2025,8,6]],"date-time":"2025-08-06T22:19:59Z","timestamp":1754518799000},"page":"5-17","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["The ASPLOS 2025 \/ EuroSys 2025 Contest on Intra-Operator Parallelism for Distributed Deep Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7655-5024","authenticated-orcid":false,"given":"Michael D.","family":"Moffitt","sequence":"first","affiliation":[{"name":"Google, Austin, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0866-9429","authenticated-orcid":false,"given":"Pratik","family":"Fegade","sequence":"additional","affiliation":[{"name":"Google, Seattle, Washington, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,8,6]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A System for Large-Scale Machine Learning. In Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2016). 265-283. https:\/\/dl.acm.org\/doi\/10.5555\/3026877.3026899"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707284"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aimag.v22i3.1571"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0218213008004060"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10703-007-0038-1"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10817-006-9026-1"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495883"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3132413"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2008.93"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2024.3457735"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2018). 578-594. https:\/\/dl.acm.org\/doi\/abs\/10.5555\/3291168.3291211"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638465"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Chilimbi Trishul","year":"2014","unstructured":"Trishul Chilimbi, Yutaka Suzue, Johnson Apacible, and Karthik Kalyanaraman. 2014. Project Adam: Building an Efficient and Scalable Deep Learning Training System. In Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2014). 571-582. https:\/\/www.usenix.org\/conference\/osdi14\/technical-sessions\/presentation\/chilimbi"},{"key":"e_1_3_2_1_16_1","unstructured":"Fran\u00e7ois Chollet Mike Knoop Gregory Kamradt and Bryan Landers. 2025. ARC Prize 2024: Technical Report. showeprintcs.AI\/2412.04604 https:\/\/arxiv.org\/abs\/2412.04604"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/390013.808480"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 26th Conference on Neural Information Processing Systems, (NIPS","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, MarctextquotesingleAurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, Quoc Le, and Andrew Ng. 2012. Large Scale Distributed Deep Networks. In Proceedings of the 26th Conference on Neural Information Processing Systems, (NIPS 2012). 1223-1231. https:\/\/dl.acm.org\/doi\/10.5555\/2999134.2999271"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638466"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning, (ICML","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, Barret Zoph, Liam Fedus, Maarten P Bosma, Zongwei Zhou, Tao Wang, Emma Wang, Kellie Webster, Marie Pellat, Kevin Robinson, Kathleen Meier-Hellstern, Toju Duke, Lucas Dixon, Kun Zhang, Quoc Le, Yonghui Wu, Zhifeng Chen, and Claire Cui. 2022. GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. In Proceedings of the 39th International Conference on Machine Learning, (ICML 2022). 5547-5569. https:\/\/proceedings.mlr.press\/v162\/du22c.html"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575703"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589348"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3698038.3698535"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the NeurIPS 2021 Competitions and Demonstrations Track. 220-231","author":"Gasse Maxime","year":"2022","unstructured":"Maxime Gasse, Simon Bowly, Quentin Cappart, Jonas Charfreitag, Laurent Charlin, Didier Ch\u00e9telat, Antonia Chmiela, Justin Dumouchelle, Ambros Gleixner, Aleksandr M. Kazachkov, Elias Khalil, Pawel Lichocki, Andrea Lodi, Miles Lubin, Chris J. Maddison, Morris Christopher, Dimitri J. Papageorgiou, Augustin Parjadis, Sebastian Pokutta, Antoine Prouvost, Lara Scavuzzo, Giulia Zarpellon, Linxin Yang, Sha Lai, Akang Wang, Xiaodong Luo, Xiang Zhou, Haohan Huang, Shengcheng Shao, Yuanming Zhu, Dong Zhang, Tao Quan, Zixuan Cao, Yang Xu, Zhewei Huang, Shuchang Zhou, Chen Binbin, He Minggui, Hao Hao, Zhang Zhiyu, An Zhiwu, and Mao Kun. 2022. The Machine Learning for Combinatorial Optimization Competition (ML4CO): Results and Insights. In Proceedings of the NeurIPS 2021 Competitions and Demonstrations Track. 220-231. https:\/\/proceedings.mlr.press\/v176\/gasse22a.html"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3210377.3210394"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 10th International Conference on Learning Representations, (ICLR","author":"Godwin Jonathan","year":"2022","unstructured":"Jonathan Godwin, Michael Schaarschmidt, Alexander L Gaunt, Alvaro Sanchez-Gonzalez, Yulia Rubanova, Petar Veli\u010dkovi\u0107, James Kirkpatrick, and Peter Battaglia. 2022. Simple GNN Regularisation for 3D Molecular Property Prediction and Beyond. In Proceedings of the 10th International Conference on Learning Representations, (ICLR 2022). https:\/\/openreview.net\/forum?id=1wVvweK3oIb"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3496298"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 36th Conference on Neural Information Processing Systems, (NeurIPS","author":"Hoffmann Jordan","year":"2022","unstructured":"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, Tom Hennigan, Eric Noland, Katie Millican, George van den Driessche, Bogdan Damoc, Aurelia Guy, Simon Osindero, Karen Simonyan, Erich Elsen, Oriol Vinyals, Jack W. Rae, and Laurent Sifre. 2022. Training Compute-Optimal Large Language Models. In Proceedings of the 36th Conference on Neural Information Processing Systems, (NeurIPS 2022). Article 2176. https:\/\/dl.acm.org\/doi\/10.5555\/3600270.3602446"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 33rd Conference on Neural Information Processing Systems, (NeurIPS","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Proceedings of the 33rd Conference on Neural Information Processing Systems, (NeurIPS 2019). Article 10. https:\/\/dl.acm.org\/doi\/10.5555\/3454287.3454297"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 2022 USENIX Annual Technical Conference, (USENIX ATC 2022). 673-688","author":"Jia Xianyan","year":"2022","unstructured":"Xianyan Jia, Le Jiang, Ang Wang, Wencong Xiao, Ziji Shi, Jie Zhang, Xinyuan Li, Langshi Chen, Yong Li, Zhen Zheng, Xiaoyong Liu, and Wei Lin. 2022. Whale: Efficient Giant Model Training over Heterogeneous GPUs. In Proceedings of the 2022 USENIX Annual Technical Conference, (USENIX ATC 2022). 673-688. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/jia-xianyan"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 35th International Conference on Machine Learning, (ICML","author":"Jia Zhihao","year":"2018","unstructured":"Zhihao Jia, Sina Lin, Charles R. Qi, and Alex Aiken. 2018. Exploring Hidden Dimensions in Accelerating Convolutional Neural Networks. In Proceedings of the 35th International Conference on Machine Learning, (ICML 2018). 2274-2283. https:\/\/proceedings.mlr.press\/v80\/jia18a.html"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 2nd Conference on Systems and Machine Learning, (SysML","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. 2019. Beyond Data and Model Parallelism for Deep Neural Networks. In Proceedings of the 2nd Conference on Systems and Machine Learning, (SysML 2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/238"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aimag.v33i1.2395"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/266021.266273"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 9th International Conference on Learning Representations, (ICLR","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In Proceedings of the 9th International Conference on Learning Representations, (ICLR 2021). https:\/\/openreview.net\/forum?id=qrwe7XHTmYb"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_45_1","volume-title":"Alpa: Automated Model-Parallel Deep Learning. https:\/\/research.google\/blog\/alpa-automated-model-parallel-deep-learning\/","author":"Li Zhuohan","year":"2022","unstructured":"Zhuohan Li and Yu Emma Wang. 2022. Alpa: Automated Model-Parallel Deep Learning. https:\/\/research.google\/blog\/alpa-automated-model-parallel-deep-learning\/"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 17th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In Proceedings of the 17th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2023). 663-679. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, (ICML","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. 2021. TeraPipe: Token-Level Pipeline Parallelism for Training Large-Scale Language Models. In Proceedings of the 38th International Conference on Machine Learning, (ICML 2021). 6543-6552. https:\/\/proceedings.mlr.press\/v139\/li21y.html"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Lin Zhiqi","year":"2024","unstructured":"Zhiqi Lin, Youshan Miao, Quanlu Zhang, Fan Yang, Yi Zhu, Cheng Li, Saeed Maleki, Xu Cao, Ning Shang, Yilei Yang, Weijiang Xu, Mao Yang, Lintao Zhang, and Lidong Zhou. 2024. nnScaler: Constraint-Guided Parallelization Plan Generation for Deep Learning Training. In Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2024). 347-363. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/lin-zhiqi"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2010.2089569"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1240"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aimag.v21i2.1505"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 21th International Conference on Very Large Data Bases, (VLDB","author":"Mehta Manish","year":"1995","unstructured":"Manish Mehta and David J. DeWitt. 1995. Managing Intra-operator Parallelism in Parallel Database Systems. In Proceedings of the 21th International Conference on Very Large Data Bases, (VLDB 1995). 382-394. https:\/\/dl.acm.org\/doi\/10.5555\/645921.673299"},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 34th International Conference on Machine Learning, (ICML","author":"Mirhoseini Azalia","year":"2017","unstructured":"Azalia Mirhoseini, Hieu Pham, Quoc V. Le, Benoit Steiner, Rasmus Larsen, Yuefeng Zhou, Naveen Kumar, Mohammad Norouzi, Samy Bengio, and Jeff Dean. 2017. Device Placement Optimization with Reinforcement Learning. In Proceedings of the 34th International Conference on Machine Learning, (ICML 2017). 2430-2439. https:\/\/dl.acm.org\/doi\/10.5555\/3305890.3305932"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2019.101635"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623278.3624752"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1391469.1391655"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/1055137.1055182"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/1231996.1232029"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, (ICML","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021a. Memory-Efficient Pipeline-Parallel DNN Training. In Proceedings of the 38th International Conference on Machine Learning, (ICML 2021). 7937-7947. https:\/\/proceedings.mlr.press\/v139\/narayanan21a.html"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference, (USENIX ATC 2020). 307-321","author":"Park Jay H.","year":"2020","unstructured":"Jay H. Park, Gyeongchan Yun, Chang M. Yi, Nguyen T. Nguyen, Seungmin Lee, Jaesik Choi, Sam H. Noh, and Young-ri Choi. 2020. HetPipe: Enabling Large DNN Training on (Whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In Proceedings of the 2020 USENIX Annual Technical Conference, (USENIX ATC 2020). 307-321. https:\/\/www.usenix.org\/conference\/atc20\/presentation\/park"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the 33rd Conference on Neural Information Processing Systems, (NeurIPS","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Proceedings of the 33rd Conference on Neural Information Processing Systems, (NeurIPS 2019). Article 721. https:\/\/dl.acm.org\/doi\/abs\/10.5555\/3454287.3455008"},{"key":"e_1_3_2_1_64_1","volume-title":"Proceedings of the 37th Conference on Neural Information Processing Systems, (NeurIPS","author":"Phothilimthana Mangpo","year":"2023","unstructured":"Mangpo Phothilimthana, Sami Abu-El-Haija, Kaidi Cao, Bahare Fatemi, Michael Burrows, Charith Mendis, and Bryan Perozzi. 2023. TpuGraphs: A Performance Prediction Dataset on Large Tensor Computational Graphs. In Proceedings of the 37th Conference on Neural Information Processing Systems, (NeurIPS 2023). Article 3083. https:\/\/dl.acm.org\/doi\/abs\/10.5555\/3666122.3669205"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 6th Conference on Machine Learning and Systems. 606-624","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently Scaling Transformer Inference. In Proceedings of the 6th Conference on Machine Learning and Systems. 606-624."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553486"},{"key":"e_1_3_2_1_67_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning, (ICML 2022). 1833","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In Proceedings of the 39th International Conference on Machine Learning, (ICML 2022). 18332-18346. https:\/\/proceedings.mlr.press\/v162\/rajbhandari22a.html"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.5555\/3600270.3602913"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525722"},{"key":"e_1_3_2_1_75_1","volume-title":"Proceedings of the 32nd Conference on Neural Information Processing Systems, (NIPS","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, Ryan Sepassi, and Blake Hechtman. 2018. Mesh-TensorFlow: Deep Learning for Supercomputers. In Proceedings of the 32nd Conference on Neural Information Processing Systems, (NIPS 2018). 10435-10444. https:\/\/dl.acm.org\/doi\/10.5555\/3327546.3327703"},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning, (ICML","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher Re, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In Proceedings of the 40th International Conference on Machine Learning, (ICML 2023). 31094-31116. https:\/\/proceedings.mlr.press\/v202\/sheng23a.html"},{"key":"e_1_3_2_1_77_1","volume-title":"Proceedings of the ISCA 2023 Workshop on Architecture and System Support for Transformer Models, (ASSYST","author":"Shi Ziji","year":"2023","unstructured":"Ziji Shi, Le Jiang, Ang Wang, Jie Zhang, Xianyan Jia, Yong Li, Chencan Wu, Jialin Li, and Wei Lin. 2023. TAP: Efficient Derivation of Tensor Parallel Plans for Large Neural Networks. In Proceedings of the ISCA 2023 Workshop on Architecture and System Support for Transformer Models, (ASSYST 2023). https:\/\/openreview.net\/forum?id=6d5El_LENnf"},{"key":"e_1_3_2_1_78_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_79_1","volume-title":"Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro.","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model. https:\/\/arxiv.org\/abs\/2201.11990"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863730"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10601-010-9093-0"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1609\/aimag.v35i2.2539"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"e_1_3_2_1_84_1","volume-title":"Proceedings of the 34th Conference on Neural Information Processing Systems, (NeurIPS 2021","author":"Tarnawski Jakub","year":"2021","unstructured":"Jakub Tarnawski, Deepak Narayanan, and Amar Phanishayee. 2021. Piper: Multidimensional Planner for DNN Parallelization. In Proceedings of the 34th Conference on Neural Information Processing Systems, (NeurIPS 2021). Article 1902. https:\/\/dl.acm.org\/doi\/10.5555\/3540261.3542163"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497020"},{"key":"e_1_3_2_1_86_1","unstructured":"Gemma Team. 2024a. Gemma 2: Improving Open Language Models at a Practical Size. https:\/\/arxiv.org\/abs\/2408.00118"},{"key":"e_1_3_2_1_87_1","volume-title":"Gemma: Open Models Based on Gemini Research and Technology. https:\/\/arxiv.org\/abs\/2403.08295","author":"Team Gemma","year":"2024","unstructured":"Gemma Team. 2024b. Gemma: Open Models Based on Gemini Research and Technology. https:\/\/arxiv.org\/abs\/2403.08295"},{"key":"e_1_3_2_1_88_1","unstructured":"Gemma Team. 2025. Gemma 3 Technical Report. https:\/\/storage.googleapis.com\/deepmind-media\/gemma\/Gemma3Report.pdf"},{"key":"e_1_3_2_1_89_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. 2022. Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2022). 267-284. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/unger"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228500"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1145\/2429384.2429456"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.trb.2013.01.006"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2015.2472014"},{"key":"e_1_3_2_1_95_1","unstructured":"Yuanzhong Xu HyoukJoong Lee Dehao Chen Hongjun Choi Blake Hechtman and Shibo Wang. 2020. Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training. https:\/\/arxiv.org\/abs\/2004.13336"},{"key":"e_1_3_2_1_96_1","volume-title":"GSPMD: General and Scalable Parallelization for ML Computation Graphs. https:\/\/arxiv.org\/abs\/2105.04663","author":"Xu Yuanzhong","year":"2021","unstructured":"Yuanzhong Xu, HyoukJoong Lee, Dehao Chen, Blake Hechtman, Yanping Huang, Rahul Joshi, Maxim Krikun, Dmitry Lepikhin, Andy Ly, Marcello Maggioni, Ruoming Pang, Noam Shazeer, Shibo Wang, Tao Wang, Yonghui Wu, and Zhifeng Chen. 2021. GSPMD: General and Scalable Parallelization for ML Computation Graphs. https:\/\/arxiv.org\/abs\/2105.04663"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523437"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_99_1","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2020). 863-879. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/zheng"},{"key":"e_1_3_2_1_100_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation, (OSDI","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation, (OSDI 2022). 559-578. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zheng-lianmin"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Rotterdam Netherlands","acronym":"ASPLOS '25"},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676642.3736399","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T22:23:46Z","timestamp":1757543026000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676642.3736399"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,6]]},"references-count":100,"alternative-id":["10.1145\/3676642.3736399","10.1145\/3676642"],"URL":"https:\/\/doi.org\/10.1145\/3676642.3736399","relation":{},"subject":[],"published":{"date-parts":[[2025,8,6]]},"assertion":[{"value":"2025-08-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}