{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T14:45:16Z","timestamp":1773153916656,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,3]],"date-time":"2022-10-03T00:00:00Z","timestamp":1664755200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSF","award":["CAREER CNS-2110259, CNS-2112471, CNS-2102233, CCF-2110252, 21-20448, 19-34884, CNS-2112694"],"award-info":[{"award-number":["CAREER CNS-2110259, CNS-2112471, CNS-2102233, CCF-2110252, 21-20448, 19-34884, CNS-2112694"]}]},{"name":"Cisco Systems Research Grant","award":["GR127298"],"award-info":[{"award-number":["GR127298"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,3]]},"DOI":"10.1145\/3492866.3549716","type":"proceedings-article","created":{"date-parts":[[2022,9,21]],"date-time":"2022-09-21T16:34:33Z","timestamp":1663778073000},"page":"21-30","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["On scheduling ring-all-reduce learning jobs in multi-tenant GPU clusters with communication contention"],"prefix":"10.1145","author":[{"given":"Menglu","family":"Yu","sequence":"first","affiliation":[{"name":"Iowa State University"}]},{"given":"Bo","family":"Ji","sequence":"additional","affiliation":[{"name":"Virginia Tech"}]},{"given":"Hridesh","family":"Rajan","sequence":"additional","affiliation":[{"name":"Iowa State University"}]},{"given":"Jia","family":"Liu","sequence":"additional","affiliation":[{"name":"The Ohio State University and Iowa State University"}]}],"member":"320","published-online":{"date-parts":[[2022,10,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proc. of USENIX OSDI","author":"Abadi M.","year":"2016","unstructured":"Abadi, M., Barham, P., et al. TensorFlow: A system for large-scale machine learning. In Proc. of USENIX OSDI (2016)."},{"key":"e_1_3_2_1_2_1","volume-title":"Deep learning-based job placement in distributed machine learning clusters. In in IEEE INFOCOM","author":"Bao Y.","year":"2019","unstructured":"Bao, Y., Peng, Y., and Wu, C. Deep learning-based job placement in distributed machine learning clusters. In in IEEE INFOCOM (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077839.3077855"},{"key":"e_1_3_2_1_4_1","first-page":"7","volume-title":"Ultra-performance pascal gpu and nvlink interconnect","author":"Foley D.","year":"2017","unstructured":"Foley, D., and Danskin, J. Ultra-performance pascal gpu and nvlink interconnect. In IEEE Micr (2017), vol. 37, pp. 7--17."},{"key":"e_1_3_2_1_5_1","volume-title":"OSDI","author":"Grandl R.","year":"2016","unstructured":"Grandl, R., Chowdhury, M., Akella, A., and Ananthanarayanan, G. Altruistic scheduling in multi-resource clusters. In OSDI (2016)."},{"key":"e_1_3_2_1_6_1","volume-title":"OSDI","author":"Grandl R.","year":"2016","unstructured":"Grandl, R., Kandula, S., Rao, S., Akella, A., and Kulkarni, J. Graphene: Packing and dependency-aware scheduling for data-parallel clusters. In OSDI (2016)."},{"key":"e_1_3_2_1_7_1","first-page":"485","article-title":"Tiresias: A gpu cluster manager for distributed deep learning","volume":"19","author":"Gu J.","year":"2019","unstructured":"Gu, J., Chowdhury, M., Shin, K. G., Zhu, Y., Jeon, M., Qian, J., Liu, H., and Guo, C. Tiresias: A gpu cluster manager for distributed deep learning. In NSDI 19 (2019), pp. 485--500.","journal-title":"NSDI"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00201"},{"key":"e_1_3_2_1_9_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon M.","year":"2019","unstructured":"Jeon, M., Venkataraman, S., Phanishayee, A., Qian, J., Xiao, W., and Yang, F. Analysis of large-scale multi-tenant gpu clusters for dnn training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19) (2019)."},{"key":"e_1_3_2_1_10_1","first-page":"289","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan K.","year":"2020","unstructured":"Mahajan, K., Balasubramanian, A., Singhvi, A., Venkataraman, S., Akella, A., Phanishayee, A., and Chawla, S. Themis: Fair and efficient gpu cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20) (2020), pp. 289--304."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2017.8057205"},{"key":"e_1_3_2_1_12_1","volume-title":"NeurIPS","author":"Paszke A.","year":"2019","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., Desmaison, A., K\u00f6pf, A., Yang, E., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., and Chintala, S. Pytorch: an imperative style, high-performance deep learning library. In NeurIPS (2019)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_15_1","volume-title":"Horovod: Fast and easy distributed deep learning in tensorflow. In arXiv preprint arXiv:1802.05799","author":"Sergeev A.","year":"2018","unstructured":"Sergeev, A., and Balso, M. D. Horovod: Fast and easy distributed deep learning in tensorflow. In arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.simpat.2010.08.010"},{"key":"e_1_3_2_1_18_1","first-page":"1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Wang L.","year":"2020","unstructured":"Wang, L., Weng, Q., Wang, W., Chen, C., and Li, B. Metis: Learning to schedule long-running applications in shared container clusters at scale. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis (2020), pp. 1--17."},{"key":"e_1_3_2_1_19_1","volume-title":"Communication contention aware scheduling of multiple deep learning training jobs. In arXiv:2002.10105","author":"Wang Q.","year":"2020","unstructured":"Wang, Q., Shi, S., Wang, C., and Chu, X. Communication contention aware scheduling of multiple deep learning training jobs. In arXiv:2002.10105 (2020)."},{"key":"e_1_3_2_1_20_1","first-page":"595","volume-title":"in 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao W.","year":"2018","unstructured":"Xiao, W., Bhardwaj, R., Ramjee, R., Sivathanu, M., Kwatra, N., Han, Z., Patel, P., Peng, X., Zhao, H., Zhang, Q., Yang, F., and Zhou, L. Gandiva: Introspective cluster scheduling for deep learning. In in 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18) (2018), pp. 595--610."},{"key":"e_1_3_2_1_21_1","unstructured":"Yu M. Ji B. Rajan H. and Liu J. On scheduling ring-all-reduce learning jobs in multi-tenant gpu clusters with communication contention. https:\/\/arxiv.org\/abs\/2207.07817."},{"key":"e_1_3_2_1_22_1","article-title":"Toward efficient online scheduling for distributed machine learning systems","author":"Yu M.","year":"2021","unstructured":"Yu, M., Liu, J., Wu, C., Ji, B., and Bentley, E. S. Toward efficient online scheduling for distributed machine learning systems. IEEE Transactions on Network Science and Engineering (TNSE) (2021).","journal-title":"IEEE Transactions on Network Science and Engineering (TNSE) ("},{"key":"e_1_3_2_1_23_1","volume-title":"Gadget: Online resource optimization for scheduling ring-all-reduce learning jobs","author":"Yu M.","year":"2022","unstructured":"Yu, M., Tian, Y., Ji, B., Wu, C., Rajan, H., and Liu, J. Gadget: Online resource optimization for scheduling ring-all-reduce learning jobs. In IEEE INFOCOM (2022)."},{"key":"e_1_3_2_1_24_1","first-page":"181","volume-title":"in 2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Zhang H.","year":"2017","unstructured":"Zhang, H., Zheng, Z., Xu, S., Dai, W., Ho, Q., Liang, X., Hu, Z., Wei, J., Xie, P., and Xing, E. P. Poseidon: An efficient communication architecture for distributed deep learning on gpu clusters. In in 2017 USENIX Annual Technical Conference (USENIX ATC 17) (2017), pp. 181--193."}],"event":{"name":"MobiHoc '22: The Twenty-third International Symposium on Theory, Algorithmic Foundations, and Protocol Design for Mobile Networks and Mobile Computing","location":"Seoul Republic of Korea","acronym":"MobiHoc '22","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"]},"container-title":["Proceedings of the Twenty-Third International Symposium on Theory, Algorithmic Foundations, and Protocol Design for Mobile Networks and Mobile Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3492866.3549716","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3492866.3549716","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3492866.3549716","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:27Z","timestamp":1750193307000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3492866.3549716"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,3]]},"references-count":24,"alternative-id":["10.1145\/3492866.3549716","10.1145\/3492866"],"URL":"https:\/\/doi.org\/10.1145\/3492866.3549716","relation":{},"subject":[],"published":{"date-parts":[[2022,10,3]]},"assertion":[{"value":"2022-10-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}