{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T03:51:13Z","timestamp":1752983473759,"version":"3.28.0"},"reference-count":56,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,6]],"date-time":"2024-05-06T00:00:00Z","timestamp":1714953600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,6]],"date-time":"2024-05-06T00:00:00Z","timestamp":1714953600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,6]]},"DOI":"10.1109\/noms59830.2024.10575188","type":"proceedings-article","created":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T17:23:51Z","timestamp":1719941031000},"page":"1-10","source":"Crossref","is-referenced-by-count":1,"title":["Accelerating Containerized Machine Learning Workloads"],"prefix":"10.1109","author":[{"given":"Ali","family":"Tariq","sequence":"first","affiliation":[{"name":"University of Colorado Boulder"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lianjie","family":"Cao","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Faraz","family":"Ahmed","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Eric","family":"Rozner","sequence":"additional","affiliation":[{"name":"University of Colorado Boulder"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Puneet","family":"Sharma","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"279","article-title":"Accelerating distributed reinforcement learning with in-switch computing","volume-title":"2019 ACM\/IEEE 46th Annual International Symposium on Computer Architecture (ISCA)","author":"Li"},{"year":"2022","key":"ref2","article-title":"Parameter server training with parameterserverstrategy"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685095"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421307"},{"key":"ref5","first-page":"947","article-title":"Analysis of Large-Scale Multi-Tenant GPU clusters for DNN training workloads","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon"},{"key":"ref6","first-page":"945","article-title":"{MLaaS} in the wild: Workload analysis and scheduling in {Large-Scale} heterogeneous {GPU} clusters","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3127490"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref11","first-page":"265","article-title":"{TensorFlow}: a system for {Large-Scale} machine learning","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi"},{"volume-title":"GPipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism","year":"2019","author":"Huang","key":"ref12"},{"article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","year":"2019","author":"Shoeybi","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00203"},{"article-title":"Elastic model aggregation with parameter service","year":"2022","author":"Gu","key":"ref15"},{"key":"ref16","first-page":"485","article-title":"Tiresias: A GPU cluster manager for distributed deep learning","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu"},{"key":"ref17","first-page":"595","article-title":"Gandiva: Introspective cluster scheduling for deep learning","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao"},{"key":"ref18","first-page":"533","article-title":"AntMan: Dynamic scaling on GPU clusters for deep learning","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao"},{"key":"ref19","first-page":"98","article-title":"Fine-grained gpu sharing primitives for deep learning applications","volume-title":"Proceedings of Machine Learning and Systems","volume":"2","author":"Yu"},{"volume-title":"HiveD: Sharing a GPU Cluster for Deep Learning with Guarantees","year":"2020","author":"Zhao","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/tcc.2020.3006751"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3071762"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3074057"},{"article-title":"Deepergcn: All you need to train deeper gcns","year":"2020","author":"Li","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403076"},{"key":"ref27","first-page":"291","article-title":"Slide : In defense of smart algorithms over hardware acceleration for large-scale deep learning systems","volume-title":"Proceedings of Machine Learning and Systems","volume":"2","author":"Chen"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414655"},{"article-title":"Deep learning training in facebook data centers: Design of scale-up and scale-out systems","year":"2020","author":"Naumov","key":"ref29"},{"key":"ref30","first-page":"951","article-title":"Deepcpu: Serving rnn-based deep learning models 10x faster","volume-title":"Proceedings of the 2018 USENIX Conference on Usenix Annual Technical Conference, ser. USENIX ATC \u201918","author":"Zhang"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10221-5"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"author":"Krizhevsky","key":"ref33","article-title":"Cifar-10 (canadian institute for advanced research)"},{"key":"ref34","article-title":"Tensorflow:tf.config.threading"},{"key":"ref35","article-title":"Tensorflow:tf.config.threading"},{"article-title":"Alpa: Automating inter- and intra-operator parallelism for distributed deep learning","year":"2022","author":"Zheng","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/2668930.2688047"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS.2018.8641686"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.is.2019.01.006"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/2741948.2741964"},{"article-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications","year":"2017","author":"Howard","key":"ref41"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.243"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.308"},{"article-title":"Cyclegan","year":"2020","author":"Nain","key":"ref46"},{"article-title":"Traffic forecasting using graph neural networks and lstm","year":"2021","author":"Khodadadi","key":"ref47"},{"article-title":"Gpt text generation from scratch with kerasnlp","year":"2022","author":"Chan","key":"ref48"},{"article-title":"Image classification with vision transformer","year":"2021","author":"Salama","key":"ref49"},{"year":"2020","key":"ref50","article-title":"Bidirectional lstm"},{"article-title":"Palm: Scaling language modeling with pathways","year":"2022","author":"Chowdhery","key":"ref51"},{"year":"2022","key":"ref52","article-title":"Nvidia multi-process service"},{"year":"2022","key":"ref53","article-title":"Nvidia multi-instance gpu user guide"},{"key":"ref54","first-page":"20","article-title":"Sla-driven ml inference framework for clouds with heterogeneous accelerators","volume-title":"Proceedings of Machine Learning and Systems","volume":"4","author":"Cho"},{"article-title":"Horovod: fast and easy distributed deep learning in tensorflow","year":"2018","author":"Sergeev","key":"ref55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.5555\/3291168.3291210"}],"event":{"name":"NOMS 2024-2024 IEEE Network Operations and Management Symposium","start":{"date-parts":[[2024,5,6]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2024,5,10]]}},"container-title":["NOMS 2024-2024 IEEE Network Operations and Management Symposium"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10574855\/10574897\/10575188.pdf?arnumber=10575188","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,6]],"date-time":"2024-07-06T04:50:19Z","timestamp":1720241419000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10575188\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,6]]},"references-count":56,"URL":"https:\/\/doi.org\/10.1109\/noms59830.2024.10575188","relation":{},"subject":[],"published":{"date-parts":[[2024,5,6]]}}}