{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:12:10Z","timestamp":1780708330066,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,4,15]],"date-time":"2020-04-15T00:00:00Z","timestamp":1586908800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,4,15]]},"DOI":"10.1145\/3342195.3387555","type":"proceedings-article","created":{"date-parts":[[2020,5,4]],"date-time":"2020-05-04T07:19:58Z","timestamp":1588576798000},"page":"1-16","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":125,"title":["Balancing efficiency and fairness in heterogeneous GPU clusters for deep learning"],"prefix":"10.1145","author":[{"given":"Shubham","family":"Chaudhary","sequence":"first","affiliation":[{"name":"Microsoft Research India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ramachandran","family":"Ramjee","sequence":"additional","affiliation":[{"name":"Microsoft Research India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Muthian","family":"Sivathanu","sequence":"additional","affiliation":[{"name":"Microsoft Research India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nipun","family":"Kwatra","sequence":"additional","affiliation":[{"name":"Microsoft Research India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Srinidhi","family":"Viswanatha","sequence":"additional","affiliation":[{"name":"Microsoft Research India"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2020,4,17]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Akka Actors. https:\/\/akka.io.  Akka Actors. https:\/\/akka.io."},{"key":"e_1_3_2_1_2_1","unstructured":"Blobfuse. https:\/\/docs.microsoft.com\/bs-latn-ba\/azure\/storage\/blobs\/storage-how-to-mount-container-linux.  Blobfuse. https:\/\/docs.microsoft.com\/bs-latn-ba\/azure\/storage\/blobs\/storage-how-to-mount-container-linux."},{"key":"e_1_3_2_1_3_1","unstructured":"Checkpoint\/Restore in User Space. https:\/\/criu.org\/Main_Page.  Checkpoint\/Restore in User Space. https:\/\/criu.org\/Main_Page."},{"key":"e_1_3_2_1_4_1","unstructured":"Fair scheduler in hadoop. https:\/\/hadoop.apache.org\/docs\/r2.7.4\/hadoop-yarn\/hadoop-yarn-site\/FairScheduler.html.  Fair scheduler in hadoop. https:\/\/hadoop.apache.org\/docs\/r2.7.4\/hadoop-yarn\/hadoop-yarn-site\/FairScheduler.html."},{"key":"e_1_3_2_1_5_1","unstructured":"gRPC A high-performance open-source universal RPC framework. https:\/\/grpc.io.  gRPC A high-performance open-source universal RPC framework. https:\/\/grpc.io."},{"key":"e_1_3_2_1_6_1","unstructured":"Lstm training on wikitext-2 dataset. https:\/\/github.com\/pytorch\/examples\/tree\/master\/word_language_model.  Lstm training on wikitext-2 dataset. https:\/\/github.com\/pytorch\/examples\/tree\/master\/word_language_model."},{"key":"e_1_3_2_1_7_1","unstructured":"The Scala Programming Language. https:\/\/www.scala-lang.org.  The Scala Programming Language. https:\/\/www.scala-lang.org."},{"key":"e_1_3_2_1_8_1","volume-title":"The lovely but lonely vickrey auction. Combinatorial auctions 17","author":"Ausubel L. M.","year":"2006","unstructured":"Ausubel , L. M. , Milgrom , P. , The lovely but lonely vickrey auction. Combinatorial auctions 17 ( 2006 ), 22--26. Ausubel, L. M., Milgrom, P., et al. The lovely but lonely vickrey auction. Combinatorial auctions 17 (2006), 22--26."},{"key":"e_1_3_2_1_9_1","volume-title":"Scalable Multi-Framework Multi-Tenant Lifecycle Management of Deep Learning Training Jobs. In Workshop on ML Systems, NIPS","author":"Boag S.","year":"2017","unstructured":"Boag , S. , Dube , P. , Herta , B. , Hummer , W. , Ishakian , V. , Jayaram , K. , Kalantar , M. , Muthusamy , V. , Nagpurkar , P. , and Rosenberg , F . Scalable Multi-Framework Multi-Tenant Lifecycle Management of Deep Learning Training Jobs. In Workshop on ML Systems, NIPS ( 2017 ). Boag, S., Dube, P., Herta, B., Hummer, W., Ishakian, V., Jayaram, K., Kalantar, M., Muthusamy, V., Nagpurkar, P., and Rosenberg, F. Scalable Multi-Framework Multi-Tenant Lifecycle Management of Deep Learning Training Jobs. In Workshop on ML Systems, NIPS (2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2898442.2898444"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2007.70755"},{"key":"e_1_3_2_1_12_1","volume-title":"On the properties of neural machine translation: Encoder-decoder approaches. arXiv preprint arXiv:1409.1259","author":"Cho K.","year":"2014","unstructured":"Cho , K. , Van Merri\u00ebnboer , B. , Bahdanau , D. , and Bengio , Y . On the properties of neural machine translation: Encoder-decoder approaches. arXiv preprint arXiv:1409.1259 ( 2014 ). Cho, K., Van Merri\u00ebnboer, B., Bahdanau, D., and Bengio, Y. On the properties of neural machine translation: Encoder-decoder approaches. arXiv preprint arXiv:1409.1259 (2014)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPPS.1998.669970"},{"key":"e_1_3_2_1_14_1","first-page":"24","volume-title":"Nsdi","volume":"11","author":"Ghodsi A.","year":"2011","unstructured":"Ghodsi , A. , Zaharia , M. , Hindman , B. , Konwinski , A. , Shenker , S. , and Stoica , I . Dominant resource fairness: Fair allocation of multiple resource types . In Nsdi ( 2011 ), vol. 11 , pp. 24 -- 24 . Ghodsi, A., Zaharia, M., Hindman, B., Konwinski, A., Shenker, S., and Stoica, I. Dominant resource fairness: Fair allocation of multiple resource types. In Nsdi (2011), vol. 11, pp. 24--24."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2465351.2465387"},{"key":"e_1_3_2_1_16_1","first-page":"99","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Gog I.","year":"2016","unstructured":"Gog , I. , Schwarzkopf , M. , Gleave , A. , Watson , R. N. M. , and Hand , S . Firmament: Fast, Centralized Cluster Scheduling at Scale . In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (Savannah, GA, 2016 ), USENIX Association , pp. 99 -- 115 . Gog, I., Schwarzkopf, M., Gleave, A., Watson, R. N. M., and Hand, S. Firmament: Fast, Centralized Cluster Scheduling at Scale. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (Savannah, GA, 2016), USENIX Association, pp. 99--115."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"e_1_3_2_1_18_1","first-page":"81","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Grandl R.","year":"2016","unstructured":"Grandl , R. , Kandula , S. , Rao , S. , Akella , A. , and Kulkarni , J . GRAPHENE: Packing and dependency-aware scheduling for data-parallel clusters . In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (Savannah, GA, 2016 ), USENIX Association , pp. 81 -- 97 . Grandl, R., Kandula, S., Rao, S., Akella, A., and Kulkarni, J. GRAPHENE: Packing and dependency-aware scheduling for data-parallel clusters. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (Savannah, GA, 2016), USENIX Association, pp. 81--97."},{"key":"e_1_3_2_1_19_1","first-page":"485","volume-title":"16th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 19)","author":"Gu J.","year":"2019","unstructured":"Gu , J. , Chowdhury , M. , Shin , K. G. , Zhu , Y. , Jeon , M. , Qian , J. , Liu , H. , and Guo , C . Tiresias: A {GPU} cluster manager for distributed deep learning . In 16th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 19) ( 2019 ), pp. 485 -- 500 . Gu, J., Chowdhury, M., Shin, K. G., Zhu, Y., Jeon, M., Qian, J., Liu, H., and Guo, C. Tiresias: A {GPU} cluster manager for distributed deep learning. In 16th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 19) (2019), pp. 485--500."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","first-page":"22","volume-title":"Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center. In NSDI","volume":"11","author":"Hindman B.","year":"2011","unstructured":"Hindman , B. , Konwinski , A. , Zaharia , M. , Ghodsi , A. , Joseph , A. D. , Katz , R. H. , Shenker , S. , and Stoica , I . Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center. In NSDI ( 2011 ), vol. 11 , pp. 22 -- 22 . Hindman, B., Konwinski, A., Zaharia, M., Ghodsi, A., Joseph, A. D., Katz, R. H., Shenker, S., and Stoica, I. Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center. In NSDI (2011), vol. 11, pp. 22--22."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/1272996.1273005"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/1629575.1629601"},{"key":"e_1_3_2_1_24_1","first-page":"947","volume-title":"Analysis of large-scale multi-tenant {GPU} clusters for {DNN} training workloads. In 2019 {USENIX} Annual Technical Conference ({USENIX}{ATC} 19)","author":"Jeon M.","year":"2019","unstructured":"Jeon , M. , Venkataraman , S. , Phanishayee , A. , Qian , J. , Xiao , W. , and Yang , F . Analysis of large-scale multi-tenant {GPU} clusters for {DNN} training workloads. In 2019 {USENIX} Annual Technical Conference ({USENIX}{ATC} 19) ( 2019 ), pp. 947 -- 960 . Jeon, M., Venkataraman, S., Phanishayee, A., Qian, J., Xiao, W., and Yang, F. Analysis of large-scale multi-tenant {GPU} clusters for {DNN} training workloads. In 2019 {USENIX} Annual Technical Conference ({USENIX}{ATC} 19) (2019), pp. 947--960."},{"key":"e_1_3_2_1_25_1","volume-title":"Multi-tenant gpu clusters for deep learning workloads: Analysis and implications. MSR-TR-2018-13","author":"Jeon M.","year":"2018","unstructured":"Jeon , M. , Venkataraman , S. , Qian , J. , Phanishayee , A. , Xiao , W. , and Yang , F . Multi-tenant gpu clusters for deep learning workloads: Analysis and implications. MSR-TR-2018-13 ( 2018 ). Jeon, M., Venkataraman, S., Qian, J., Phanishayee, A., Xiao, W., and Yang, F. Multi-tenant gpu clusters for deep learning workloads: Analysis and implications. MSR-TR-2018-13 (2018)."},{"key":"e_1_3_2_1_26_1","volume-title":"Second International Conference on Learning Representations, ICLR","author":"Kingma D. P.","year":"2014","unstructured":"Kingma , D. P. , and Welling , M . Stochastic gradient vb and the variational auto-encoder . In Second International Conference on Learning Representations, ICLR ( 2014 ). Kingma, D. P., and Welling, M. Stochastic gradient vb and the variational auto-encoder. In Second International Conference on Learning Representations, ICLR (2014)."},{"key":"e_1_3_2_1_27_1","unstructured":"Krizhevsky A. Nair V. and Hinton G. Cifar-10 (canadian institute for advanced research).  Krizhevsky A. Nair V. and Hinton G. Cifar-10 (canadian institute for advanced research)."},{"key":"e_1_3_2_1_28_1","unstructured":"LeCun Y. and Cortes C. MNIST handwritten digit database.  LeCun Y. and Cortes C. MNIST handwritten digit database."},{"key":"e_1_3_2_1_29_1","volume-title":"Massively parallel hyperparameter tuning. arXiv preprint arXiv:1810.05934","author":"Li L.","year":"2018","unstructured":"Li , L. , Jamieson , K. , Rostamizadeh , A. , Gonina , E. , Hardt , M. , Recht , B. , and Talwalkar , A . Massively parallel hyperparameter tuning. arXiv preprint arXiv:1810.05934 ( 2018 ). Li, L., Jamieson, K., Rostamizadeh, A., Gonina, E., Hardt, M., Recht, B., and Talwalkar, A. Massively parallel hyperparameter tuning. arXiv preprint arXiv:1810.05934 (2018)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2001.937655"},{"key":"e_1_3_2_1_31_1","volume-title":"Docker: Lightweight linux containers for consistent development and deployment. Linux J","author":"Merkel D.","year":"2014","unstructured":"Merkel , D. Docker: Lightweight linux containers for consistent development and deployment. Linux J . 2014 , 239 (Mar. 2014). Merkel, D. Docker: Lightweight linux containers for consistent development and deployment. Linux J. 2014, 239 (Mar. 2014)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511800481"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2377677.2377717"},{"key":"e_1_3_2_1_35_1","volume-title":"Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434","author":"Radford A.","year":"2015","unstructured":"Radford , A. , Metz , L. , and Chintala , S . Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434 ( 2015 ). Radford, A., Metz, L., and Chintala, S. Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434 (2015)."},{"key":"e_1_3_2_1_36_1","volume-title":"Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799","author":"Sergeev A.","year":"2018","unstructured":"Sergeev , A. , and Del Balso , M. Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799 ( 2018 ). Sergeev, A., and Del Balso, M. Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.207"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2523616.2523633"},{"key":"e_1_3_2_1_39_1","unstructured":"Waldspurger C. A. Lottery and stride scheduling: Flexible proportional-share resource management MIT.  Waldspurger C. A. Lottery and stride scheduling: Flexible proportional-share resource management MIT."},{"key":"e_1_3_2_1_40_1","first-page":"1","volume-title":"Proceedings of the 1st USENIX conference on Operating Systems Design and Implementation","author":"Waldspurger C. A.","year":"1994","unstructured":"Waldspurger , C. A. , and Weihl , W. E . Lottery scheduling: Flexible proportional-share resource management . In Proceedings of the 1st USENIX conference on Operating Systems Design and Implementation ( 1994 ), pp. 1 --es. Waldspurger, C. A., and Weihl, W. E. Lottery scheduling: Flexible proportional-share resource management. In Proceedings of the 1st USENIX conference on Operating Systems Design and Implementation (1994), pp. 1--es."},{"key":"e_1_3_2_1_41_1","first-page":"595","volume-title":"13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18)","author":"Xiao W.","year":"2018","unstructured":"Xiao , W. , Bhardwaj , R. , Ramjee , R. , Sivathanu , M. , Kwatra , N. , Han , Z. , Patel , P. , Peng , X. , Zhao , H. , Zhang , Q. , Gandiva: Introspective cluster scheduling for deep learning . In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18) ( 2018 ), pp. 595 -- 610 . Xiao, W., Bhardwaj, R., Ramjee, R., Sivathanu, M., Kwatra, N., Han, Z., Patel, P., Peng, X., Zhao, H., Zhang, Q., et al. Gandiva: Introspective cluster scheduling for deep learning. In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18) (2018), pp. 595--610."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Learning Representations","author":"You Y.","year":"2020","unstructured":"You , Y. , Li , J. , Reddi , S. , Hseu , J. , Kumar , S. , Bhojanapalli , S. , Song , X. , Demmel , J. , Keutzer , K. , and Hsieh , C . -J. Large batch optimization for deep learning: Training bert in 76 minutes . In International Conference on Learning Representations ( 2020 ). You, Y., Li, J., Reddi, S., Hseu, J., Kumar, S., Bhojanapalli, S., Song, X., Demmel, J., Keutzer, K., and Hsieh, C.-J. Large batch optimization for deep learning: Training bert in 76 minutes. In International Conference on Learning Representations (2020)."},{"key":"e_1_3_2_1_44_1","volume-title":"OSDI'08: Eighth Symposium on Operating System Design and Implementation (December","author":"Yu Y.","year":"2008","unstructured":"Yu , Y. , Isard , M. , Fetterly , D. , Budiu , M. , Erlingsson , U. , Gunda , P. K. , and Currey , J . Dryadlinq: A system for general-purpose distributed data-parallel computing using a high-level language . In OSDI'08: Eighth Symposium on Operating System Design and Implementation (December 2008 ), USENIX. Yu, Y., Isard, M., Fetterly, D., Budiu, M., Erlingsson, U., Gunda, P. K., and Currey, J. Dryadlinq: A system for general-purpose distributed data-parallel computing using a high-level language. In OSDI'08: Eighth Symposium on Operating System Design and Implementation (December 2008), USENIX."}],"event":{"name":"EuroSys '20: Fifteenth EuroSys Conference 2020","location":"Heraklion Greece","acronym":"EuroSys '20","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Fifteenth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3342195.3387555","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3342195.3387555","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:33:22Z","timestamp":1750199602000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3342195.3387555"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,4,15]]},"references-count":44,"alternative-id":["10.1145\/3342195.3387555","10.1145\/3342195"],"URL":"https:\/\/doi.org\/10.1145\/3342195.3387555","relation":{},"subject":[],"published":{"date-parts":[[2020,4,15]]},"assertion":[{"value":"2020-04-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}