{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:18:47Z","timestamp":1750220327014,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T00:00:00Z","timestamp":1650844800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,4,25]]},"DOI":"10.1145\/3485447.3511981","type":"proceedings-article","created":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T05:13:07Z","timestamp":1650863587000},"page":"1764-1773","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Modeling and Optimizing the Scaling Performance in Distributed Deep Learning Training"],"prefix":"10.1145","author":[{"given":"Ting","family":"Liu","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China"}]},{"given":"Tianhao","family":"Miao","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China"}]},{"given":"Qinghua","family":"Wu","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China and Purple Mountain Laboratories, China"}]},{"given":"Zhenyu","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China and Purple Mountain Laboratories, China"}]},{"given":"Guangxin","family":"He","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China"}]},{"given":"Jiaoren","family":"Wu","sequence":"additional","affiliation":[{"name":"Kuaishou, China"}]},{"given":"Shengzhuo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Kuaishou, China"}]},{"given":"Xingwu","family":"Yang","sequence":"additional","affiliation":[{"name":"Kuaishou, China"}]},{"given":"Gareth","family":"Tyson","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong and Queen Mary University of London, United Kingdom"}]},{"given":"Gaogang","family":"Xie","sequence":"additional","affiliation":[{"name":"Computer Network Information Center, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China"}]}],"member":"320","published-online":{"date-parts":[[2022,4,25]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Tensorflow: A system for large-scale machine learning. In 12th {USENIX} symposium on operating systems design and implementation ({OSDI} 16). 265\u2013283.","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, 2016. Tensorflow: A system for large-scale machine learning. In 12th {USENIX} symposium on operating systems design and implementation ({OSDI} 16). 265\u2013283."},{"key":"e_1_3_2_1_2_1","unstructured":"Alham\u00a0Fikri Aji and Kenneth Heafield. 2017. Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021(2017)."},{"key":"e_1_3_2_1_3_1","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","volume":"30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in Neural Information Processing Systems 30 (2017), 1709\u20131720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3325413.3329793"},{"key":"e_1_3_2_1_5_1","volume-title":"Parallel Blockwise Knowledge Distillation for Deep Neural Network Compression","author":"Blakeney J","year":"2020","unstructured":"Cody\u00a0J Blakeney, Xiaomin Li, Yan Yan, and Ziliang Zong. 2020. Parallel Blockwise Knowledge Distillation for Deep Neural Network Compression. IEEE Transactions on Parallel and Distributed Systems (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44947-7_10"},{"key":"e_1_3_2_1_7_1","volume-title":"CADA: Communication-Adaptive Distributed Adam. In International Conference on Artificial Intelligence and Statistics. PMLR, 613\u2013621","author":"Chen Tianyi","year":"2021","unstructured":"Tianyi Chen, Ziye Guo, Yuejiao Sun, and Wotao Yin. 2021. CADA: Communication-Adaptive Distributed Adam. In International Conference on Artificial Intelligence and Statistics. PMLR, 613\u2013621."},{"key":"e_1_3_2_1_8_1","volume-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274(2015).","author":"Chen Tianqi","year":"2015","unstructured":"Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. 2015. Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274(2015)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"e_1_3_2_1_10_1","volume-title":"d.]. AI and Compute. [EB\/OL]. https:\/\/openai.com\/blog\/ai-and-compute\/ Accessed","author":"Dario\u00a0Amodei Danny\u00a0Hernandez","year":"2021","unstructured":"Danny\u00a0Hernandez Dario\u00a0Amodei. [n. d.]. AI and Compute. [EB\/OL]. https:\/\/openai.com\/blog\/ai-and-compute\/ Accessed May 1, 2021."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_12_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018).","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018)."},{"key":"e_1_3_2_1_13_1","unstructured":"FaceBook. [n. d.]. Gloo. https:\/\/github.com\/facebookincubator\/gloo"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3229543.3229544"},{"key":"e_1_3_2_1_15_1","unstructured":"Andrew Gibiansky. 2017. Bringing HPC techniques to deep learning. https:\/\/andrew.gibiansky.com\/blog\/machine-learning\/baidu-allreduce\/"},{"key":"e_1_3_2_1_16_1","unstructured":"Priya Goyal Piotr Doll\u00e1r Ross Girshick Pieter Noordhuis Lukasz Wesolowski Aapo Kyrola Andrew Tulloch Yangqing Jia and Kaiming He. 2017. Accurate large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677(2017)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"e_1_3_2_1_18_1","volume-title":"Tictac: Accelerating distributed deep learning with communication scheduling. arXiv preprint arXiv:1803.03288(2018).","author":"Hashemi Sayed\u00a0Hadi","year":"2018","unstructured":"Sayed\u00a0Hadi Hashemi, Sangeetha\u00a0Abdu Jyothi, and Roy\u00a0H Campbell. 2018. Tictac: Accelerating distributed deep learning with communication scheduling. arXiv preprint arXiv:1803.03288(2018)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","volume-title":"More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems 2013","author":"Ho Qirong","year":"2013","unstructured":"Qirong Ho, James Cipar, Henggang Cui, Jin\u00a0Kyu Kim, Seunghak Lee, Phillip\u00a0B Gibbons, Garth\u00a0A Gibson, Gregory\u00a0R Ganger, and Eric\u00a0P Xing. 2013. More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems 2013 (2013), 1223."},{"key":"e_1_3_2_1_21_1","unstructured":"Horovod. 2018. Horovod Synthetic Benchmark. https:\/\/github.com\/horovod\/horovod\/tree\/master\/examples\/tensorflow2"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.284"},{"key":"e_1_3_2_1_23_1","unstructured":"Nikita Ivkin Daniel Rothchild Enayat Ullah Vladimir Braverman Ion Stoica and Raman Arora. 2019. Communication-efficient distributed SGD with sketching. arXiv preprint arXiv:1903.04488(2019)."},{"key":"e_1_3_2_1_24_1","unstructured":"Anand Jayarajan Jinliang Wei Garth Gibson Alexandra Fedorova and Gennady Pekhimenko. 2019. Priority-based parameter propagation for distributed DNN training. arXiv preprint arXiv:1905.03960(2019)."},{"key":"e_1_3_2_1_25_1","unstructured":"Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu 2018. Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv preprint arXiv:1807.11205(2018)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"volume-title":"11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14). 583\u2013598.","author":"Li Mu","key":"e_1_3_2_1_28_1","unstructured":"Mu Li, David\u00a0G Andersen, Jun\u00a0Woo Park, Alexander\u00a0J Smola, Amr Ahmed, Vanja Josifovski, James Long, Eugene\u00a0J Shekita, and Bor-Yiing Su. 2014. Scaling distributed machine learning with the parameter server. In 11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14). 583\u2013598."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2623330.2623612"},{"key":"e_1_3_2_1_30_1","unstructured":"Min Lin Qiang Chen and Shuicheng Yan. 2013. Network in network. arXiv preprint arXiv:1312.4400(2013)."},{"key":"e_1_3_2_1_31_1","unstructured":"Yujun Lin Song Han Huizi Mao Yu Wang and William\u00a0J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887(2017)."},{"key":"e_1_3_2_1_32_1","unstructured":"Mellanox. 2021. How to Enable Disable Lossy RoCE Accelerations. https:\/\/community.mellanox.com\/s\/article\/How-to-Enable-Disable-Lossy-RoCE-Accelerations"},{"key":"e_1_3_2_1_33_1","unstructured":"Hiroaki Mikami Hisahiro Suganuma Yoshiki Tanaka Yuichi Kageyama 2018. Massively distributed SGD: ImageNet\/ResNet-50 training in a flash. arXiv preprint arXiv:1811.05233(2018)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2017. NVIDIA collective communications library (NCCL). https:\/\/developer.nvidia.com\/nccl\/"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419394.3423637"},{"key":"e_1_3_2_1_37_1","unstructured":"Adam Paszke Sam Gross Soumith Chintala Gregory Chanan Edward Yang Zachary DeVito Zeming Lin Alban Desmaison Luca Antiga and Adam Lerer. 2017. Automatic differentiation in pytorch. (2017)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_39_1","volume-title":"Alleviating Load Imbalance in Data Processing for Large-Scale Deep Learning. In 2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID). IEEE, 262\u2013271","author":"Pumma Sarunya","year":"2020","unstructured":"Sarunya Pumma, Daniele Buono, Fabio Checconi, Xinyu Que, and Wu-chun Feng. 2020. Alleviating Load Imbalance in Data Processing for Large-Scale Deep Learning. In 2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID). IEEE, 262\u2013271."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-274"},{"key":"e_1_3_2_1_41_1","unstructured":"Alexander Sergeev and Mike Del\u00a0Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799(2018)."},{"key":"e_1_3_2_1_42_1","unstructured":"Serebryakov Sergey. 2019. Neural network runtime characteristics. https:\/\/github.com\/sergey-serebryakov\/nns."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3052862"},{"key":"e_1_3_2_1_44_1","unstructured":"Shaohuai Shi Zhenheng Tang Xiaowen Chu Chengjian Liu Wei Wang and Bo Li. 2020. Communication-Efficient Distributed Deep Learning: Survey Evaluation and Challenges. arXiv preprint arXiv:2005.13247(2020)."},{"key":"e_1_3_2_1_45_1","volume-title":"A Quantitative Survey of Communication Optimizations in Distributed Deep Learning","author":"Shi Shaohuai","year":"2020","unstructured":"Shaohuai Shi, Zhenheng Tang, Xiaowen Chu, Chengjian Liu, Wei Wang, and Bo Li. 2020. A Quantitative Survey of Communication Optimizations in Distributed Deep Learning. IEEE Network (2020)."},{"key":"e_1_3_2_1_46_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556(2014)."},{"key":"e_1_3_2_1_47_1","volume-title":"International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems. Springer, 3\u201321","author":"Tallent R","year":"2017","unstructured":"Nathan\u00a0R Tallent, Nitin\u00a0A Gawande, Charles Siegel, Abhinav Vishnu, and Adolfy Hoisie. 2017. Evaluating on-node gpu interconnects for deep learning workloads. In International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems. Springer, 3\u201321."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00057"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737595"},{"key":"e_1_3_2_1_51_1","volume-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning. arXiv preprint arXiv:1705.07878(2017).","author":"Wen Wei","year":"2017","unstructured":"Wei Wen, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. 2017. Terngrad: Ternary gradients to reduce communication in distributed deep learning. arXiv preprint arXiv:1705.07878(2017)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3417607"},{"key":"e_1_3_2_1_53_1","volume-title":"Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888 6","author":"You Yang","year":"2017","unstructured":"Yang You, Igor Gitman, and Boris Ginsburg. 2017. Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888 6 (2017), 12."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225069"},{"key":"e_1_3_2_1_55_1","volume-title":"Poseidon: An efficient communication architecture for distributed deep learning on {GPU} clusters. In 2017 {USENIX} Annual Technical Conference ({USENIX}{ATC} 17). 181\u2013193.","author":"Zhang Hao","year":"2017","unstructured":"Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric\u00a0P Xing. 2017. Poseidon: An efficient communication architecture for distributed deep learning on {GPU} clusters. In 2017 {USENIX} Annual Technical Conference ({USENIX}{ATC} 17). 181\u2013193."},{"key":"e_1_3_2_1_56_1","unstructured":"Wei Zhang Suyog Gupta Xiangru Lian and Ji Liu. 2015. Staleness-aware async-sgd for distributed deep learning. arXiv preprint arXiv:1511.05950(2015)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3405671.3405810"}],"event":{"name":"WWW '22: The ACM Web Conference 2022","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Virtual Event, Lyon France","acronym":"WWW '22"},"container-title":["Proceedings of the ACM Web Conference 2022"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485447.3511981","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3485447.3511981","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:09Z","timestamp":1750191129000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485447.3511981"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4,25]]},"references-count":56,"alternative-id":["10.1145\/3485447.3511981","10.1145\/3485447"],"URL":"https:\/\/doi.org\/10.1145\/3485447.3511981","relation":{},"subject":[],"published":{"date-parts":[[2022,4,25]]},"assertion":[{"value":"2022-04-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}