{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:13:21Z","timestamp":1780708401477,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,5,9]],"date-time":"2023-05-09T00:00:00Z","timestamp":1683590400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The National Key R&D Program of China under Grant","award":["2022YFB4501401"],"award-info":[{"award-number":["2022YFB4501401"]}]},{"name":"the National Natural Science Foundation of China (NSFC)","award":["62222210, and 62072297, and 61832006"],"award-info":[{"award-number":["62222210, and 62072297, and 61832006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,5,9]]},"DOI":"10.1145\/3587135.3592200","type":"proceedings-article","created":{"date-parts":[[2023,8,4]],"date-time":"2023-08-04T10:22:41Z","timestamp":1691144561000},"page":"112-122","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["DistSim"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6759-9108","authenticated-orcid":false,"given":"Guandong","family":"Lu","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Qi Zhi Institusion, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8955-2201","authenticated-orcid":false,"given":"Runzhe","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Qi Zhi Institusion, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0835-6304","authenticated-orcid":false,"given":"Yakai","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Qi Zhi Institusion, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3652-5437","authenticated-orcid":false,"given":"Yangjie","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Qi Zhi Institusion, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3887-9344","authenticated-orcid":false,"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3526-0297","authenticated-orcid":false,"given":"Zheng","family":"Hu","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3244-2660","authenticated-orcid":false,"given":"Yanming","family":"Miao","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4756-8806","authenticated-orcid":false,"given":"Zhifang","family":"Cai","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6099-614X","authenticated-orcid":false,"given":"Li","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5660-5493","authenticated-orcid":false,"given":"Jingwen","family":"Leng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Qi Zhi Institusion, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Qi Zhi Institusion, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,8,4]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Amazon. [n.d.]. AWS Pricing Calculator. https:\/\/calculator.aws\/."},{"key":"e_1_3_2_1_2_1","unstructured":"Baidu. [n. d.]. Ring AllReduce. https:\/\/github.com\/baidu-research\/baidu-allreduce."},{"key":"e_1_3_2_1_3_1","volume-title":"Maximizing Parallelism in Distributed Training for Huge Neural Networks. CoRR abs\/2105.14450","author":"Bian Zhengda","year":"2021","unstructured":"Zhengda Bian, Qifan Xu, Boxiang Wang, and Yang You. 2021. Maximizing Parallelism in Distributed Training for Huge Neural Networks. CoRR abs\/2105.14450 (2021). arXiv:2105.14450 https:\/\/arxiv.org\/abs\/2105.14450"},{"key":"e_1_3_2_1_4_1","volume-title":"Lin (Eds.)","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 1877--1901."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421307"},{"key":"e_1_3_2_1_6_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805 (2018). arXiv:1810.04805"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00111"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433722"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=JXhROKNZzOc","author":"Guo Cong","year":"2022","unstructured":"Cong Guo, Yuxian Qiu, Jingwen Leng, Xiaotian Gao, Chen Zhang, Yunxin Liu, Fan Yang, Yuhao Zhu, and Minyi Guo. 2022. SQuant: On-the-Fly Data-Free Quantization via Diagonal Hessian Approximation. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=JXhROKNZzOc"},{"key":"e_1_3_2_1_11_1","volume-title":"Nesting Forward Automatic Differentiation for Memory-Efficient Deep Neural Network Training. In 2022 IEEE 40th International Conference on Computer Design (ICCD). IEEE, 738--745","author":"Guo Cong","year":"2022","unstructured":"Cong Guo, Yuxian Qiu, Jingwen Leng, Chen Zhang, Ying Cao, Quanlu Zhang, Yunxin Liu, Fan Yang, and Minyi Guo. 2022. Nesting Forward Automatic Differentiation for Memory-Efficient Deep Neural Network Training. In 2022 IEEE 40th International Conference on Computer Design (ICCD). IEEE, 738--745."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00095"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218732"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Hu Hanpeng","year":"2022","unstructured":"Hanpeng Hu, Chenyu Jiang, Yuchen Zhong, Yanghua Peng, Chuan Wu, Yibo Zhu, Haibin Lin, and Chuanxiong Guo. 2022. dPRO: A Generic Performance Diagnosis and Optimization Toolkit for Expediting Distributed DNN Training. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 623--637."},{"key":"e_1_3_2_1_15_1","volume-title":"Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc","author":"Huang Yanping","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, and zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc."},{"key":"e_1_3_2_1_16_1","volume-title":"One weird trick for parallelizing convolutional neural networks. CoRR abs\/1404.5997","author":"Krizhevsky Alex","year":"2014","unstructured":"Alex Krizhevsky. 2014. One weird trick for parallelizing convolutional neural networks. CoRR abs\/1404.5997 (2014). arXiv:1404.5997"},{"key":"e_1_3_2_1_17_1","volume-title":"Weinberger (Eds.)","volume":"25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems, F. Pereira, C.J. Burges, L. Bottou, and K.Q. Weinberger (Eds.), Vol. 25. Curran Associates, Inc."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"Jingwen Leng Alper Buyuktosunoglu Ramon Bertran Pradip Bose Quan Chen Minyi Guo and Vijay Janapa Reddi. 2020. Asymmetric Resilience: Exploiting Task-Level Idempotency for Transient Error Recovery in Accelerator-Based Systems. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA). 44--57. https:\/\/doi.org\/10.1109\/HPCA47549.2020.00014","DOI":"10.1109\/HPCA47549.2020.00014"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507752"},{"key":"e_1_3_2_1_22_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. Check-Freq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 203--216. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/mohan"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA. [n.d.]. CUPTI. https:\/\/docs.nvidia.com\/cuda\/cupti\/."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00491"},{"key":"e_1_3_2_1_27_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2018. Language Models are Unsupervised Multitask Learners. (2018)."},{"key":"e_1_3_2_1_28_1","volume-title":"Liu","author":"Raffel Colin","year":"2019","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2019. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. CoRR abs\/1910.10683 (2019). arXiv:1910.10683"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458829"},{"key":"e_1_3_2_1_32_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 (2018). arXiv:1802.05799"},{"key":"e_1_3_2_1_33_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. CoRR abs\/1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. CoRR abs\/1909.08053 (2019). arXiv:1909.08053"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00036"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322230"},{"key":"e_1_3_2_1_36_1","volume-title":"Wortman Vaughan (Eds.)","volume":"34","author":"Tarnawski Jakub M","year":"2021","unstructured":"Jakub M Tarnawski, Deepak Narayanan, and Amar Phanishayee. 2021. Piper: Multidimensional Planner for DNN Parallelization. In Advances in Neural Information Processing Systems, M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan (Eds.), Vol. 34. Curran Associates, Inc., 24829--24840."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00088"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080203"},{"key":"e_1_3_2_1_39_1","volume-title":"Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Yu Geoffrey X.","year":"2021","unstructured":"Geoffrey X. Yu, Yubo Gao, Pavel Golikov, and Gennady Pekhimenko. 2021. Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 503--521. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/yu"},{"key":"e_1_3_2_1_40_1","volume-title":"Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. CoRR abs\/2201.12023","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Joseph E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. CoRR abs\/2201.12023 (2022). arXiv:2201.12023"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQoS.2017.7969161"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575723"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC53511.2021.00029"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2018.8573476"},{"key":"e_1_3_2_1_45_1","volume-title":"Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In 2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Zhu Hongyu","year":"2020","unstructured":"Hongyu Zhu, Amar Phanishayee, and Gennady Pekhimenko. 2020. Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). USENIX Association, 337--352."}],"event":{"name":"CF '23: 20th ACM International Conference on Computing Frontiers","location":"Bologna Italy","acronym":"CF '23","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 20th ACM International Conference on Computing Frontiers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3587135.3592200","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3587135.3592200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:14Z","timestamp":1750178834000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3587135.3592200"}},"subtitle":["A performance model of large-scale hybrid distributed DNN training"],"short-title":[],"issued":{"date-parts":[[2023,5,9]]},"references-count":45,"alternative-id":["10.1145\/3587135.3592200","10.1145\/3587135"],"URL":"https:\/\/doi.org\/10.1145\/3587135.3592200","relation":{},"subject":[],"published":{"date-parts":[[2023,5,9]]},"assertion":[{"value":"2023-08-04","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}