{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,29]],"date-time":"2026-07-29T14:54:35Z","timestamp":1785336875956,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3480859","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T05:10:34Z","timestamp":1634793034000},"page":"1-15","source":"Crossref","is-referenced-by-count":21,"title":["Online evolutionary batch size orchestration for scheduling deep learning workloads in GPU clusters"],"prefix":"10.1145","author":[{"given":"Zhengda","family":"Bian","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shenggui","family":"Li","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yang","family":"You","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Tensorflow: A system for large-scale machine learning. In 12th {USENIX} symposium on operating systems design and implementation ({OSDI} 16). 265--283.","author":"Abadi Mart\u00edn","year":"2016"},{"key":"e_1_3_2_2_2_1","volume-title":"International conference on machine learning. 173--182","author":"Amodei Dario","year":"2016"},{"key":"e_1_3_2_2_3_1","volume-title":"Online Job Scheduling in Distributed Machine Learning Clusters. In IEEE INFOCOM 2018 - IEEE Conference on Computer Communications. 495--503","author":"Bao Y.","year":"2018"},{"key":"e_1_3_2_2_4_1","unstructured":"Texas Advanced Computing Center. 2021. LONGHORN - TEXAS ADVANCED COMPUTING CENTER. https:\/\/www.tacc.utexas.edu\/systems\/longhorn.  Texas Advanced Computing Center. 2021. LONGHORN - TEXAS ADVANCED COMPUTING CENTER. https:\/\/www.tacc.utexas.edu\/systems\/longhorn."},{"key":"e_1_3_2_2_6_1","unstructured":"Jeffrey Dean Greg Corrado Rajat Monga Kai Chen Matthieu Devin Mark Mao Marc'aurelio Ranzato Andrew Senior Paul Tucker Ke Yang etal 2012. Large scale distributed deep networks. In Advances in neural information processing systems. 1223--1231.  Jeffrey Dean Greg Corrado Rajat Monga Kai Chen Matthieu Devin Mark Mao Marc'aurelio Ranzato Andrew Senior Paul Tucker Ke Yang et al. 2012. Large scale distributed deep networks. In Advances in neural information processing systems. 1223--1231."},{"key":"e_1_3_2_2_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018"},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the International Symposium on Quality of Service. 1--10","author":"Gong Yifan","year":"2019"},{"key":"e_1_3_2_2_9_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","first-page":"154","DOI":"10.1016\/j.peva.2018.10.001","article-title":"SRPT for multiserver systems","volume":"127","author":"Grosof Isaac","year":"2018","journal-title":"Performance Evaluation"},{"key":"e_1_3_2_2_12_1","volume-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019"},{"key":"e_1_3_2_2_13_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 770--778","author":"He Kaiming","year":"2016"},{"key":"e_1_3_2_2_14_1","unstructured":"Elad Hoffer Itay Hubara and Daniel Soudry. 2017. Train longer generalize better: closing the generalization gap in large batch training of neural networks. In Advances in Neural Information Processing Systems. 1731--1741.  Elad Hoffer Itay Hubara and Daniel Soudry. 2017. Train longer generalize better: closing the generalization gap in large batch training of neural networks. In Advances in Neural Information Processing Systems. 1731--1741."},{"key":"e_1_3_2_2_15_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIXATC 19)","author":"Jeon Myeongjae","year":"2019"},{"key":"e_1_3_2_2_16_1","volume-title":"On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836","author":"Keskar Nitish Shirish","year":"2016"},{"key":"e_1_3_2_2_17_1","unstructured":"Alex Krizhevsky. 2009. CIFAR-10 and CIFAR-100 datasets. https:\/\/www.cs.toronto.edu\/~kriz\/cifar.html.  Alex Krizhevsky. 2009. CIFAR-10 and CIFAR-100 datasets. https:\/\/www.cs.toronto.edu\/~kriz\/cifar.html."},{"key":"e_1_3_2_2_18_1","unstructured":"Stanford Vision Lab. 2016. ImageNet. http:\/\/www.image-net.org\/.  Stanford Vision Lab. 2016. ImageNet. http:\/\/www.image-net.org\/."},{"key":"e_1_3_2_2_19_1","volume-title":"Proceedings of the 9th International Conference on Cloud Computing and Services Science, CLOSER 2019","author":"Lin Chan-Yi","year":"2019"},{"key":"e_1_3_2_2_20_1","volume-title":"Dynamic mini-batch SGD for elastic distributed training: learning in the limbo of resources. arXiv preprint arXiv:1904.12043","author":"Lin Haibin","year":"2019"},{"key":"e_1_3_2_2_21_1","volume-title":"Proceedings of the 15th ACM Workshop on Hot Topics in Networks. ACM, 50--56","author":"Mao Hongzi","year":"2016"},{"key":"e_1_3_2_2_22_1","volume-title":"Zili Meng, and Mohammad Alizadeh.","author":"Mao Hongzi","year":"2019"},{"key":"e_1_3_2_2_23_1","unstructured":"NVIDIA. 2020. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl.  NVIDIA. 2020. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","first-page":"1346","DOI":"10.1016\/j.jss.2010.01.009","article-title":"Performance evaluation of bag of gangs scheduling in a heterogeneous distributed system","volume":"83","author":"Papazachos Zafeirios C","year":"2010","journal-title":"Journal of systems and software"},{"key":"e_1_3_2_2_25_1","volume-title":"Proceedings of the Thirteenth EuroSys Conference. ACM, 3.","author":"Peng Yanghua","year":"2018"},{"key":"e_1_3_2_2_26_1","volume-title":"DL2: A Deep Learning-driven Scheduler for Deep Learning Clusters. arXiv preprint arXiv:1909.06040","author":"Peng Yanghua","year":"2019"},{"key":"e_1_3_2_2_27_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019"},{"key":"e_1_3_2_2_28_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014"},{"key":"e_1_3_2_2_29_1","volume-title":"2017 IEEE Winter Conference on Applications of Computer Vision (WACV). IEEE, 464--472","author":"Smith Leslie N","year":"2017"},{"key":"e_1_3_2_2_30_1","volume-title":"Don't decay the learning rate, increase the batch size. arXiv preprint arXiv:1711.00489","author":"Smith Samuel L","year":"2017"},{"key":"e_1_3_2_2_31_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 1--9.","author":"Szegedy Christian","year":"2015"},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 2818--2826","author":"Szegedy Christian","year":"2016"},{"key":"e_1_3_2_2_33_1","volume-title":"Proceedings of the Tenth European Conference on Computer Systems. 1--17","author":"Verma Abhishek","year":"2015"},{"key":"e_1_3_2_2_34_1","volume-title":"Breakthroughs in statistics","author":"Wilcoxon Frank"},{"key":"e_1_3_2_2_35_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018"},{"key":"e_1_3_2_2_36_1","volume-title":"Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888","author":"You Yang","year":"2017"},{"key":"e_1_3_2_2_37_1","volume-title":"Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962","author":"You Yang","year":"2019"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Yang You Zhao Zhang Cho-Jui Hsieh James Demmel and Kurt Keutzer. 2018. ImageNet Training in Minutes. arXiv:1709.05011 [cs.CV]  Yang You Zhao Zhang Cho-Jui Hsieh James Demmel and Kurt Keutzer. 2018. ImageNet Training in Minutes. arXiv:1709.05011 [cs.CV]","DOI":"10.1145\/3225058.3225069"},{"key":"e_1_3_2_2_39_1","volume-title":"Proceedings of the 2017 Symposium on Cloud Computing. ACM, 390--404","author":"Zhang Haoyu","year":"2017"}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3480859","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3480859","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:22Z","timestamp":1750191142000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3480859"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":38,"alternative-id":["10.1145\/3458817.3480859","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3480859","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]}}}