{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T22:52:07Z","timestamp":1776725527643,"version":"3.51.2"},"reference-count":68,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"U.S. NSF","award":["CRI-1823270"],"award-info":[{"award-number":["CRI-1823270"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Cloud Comput."],"published-print":{"date-parts":[[2024,10]]},"DOI":"10.1109\/tcc.2024.3476210","type":"journal-article","created":{"date-parts":[[2024,10,7]],"date-time":"2024-10-07T17:46:14Z","timestamp":1728323174000},"page":"1344-1358","source":"Crossref","is-referenced-by-count":2,"title":["D-STACK: High Throughput DNN Inference by Effective Multiplexing and Spatio-Temporal Scheduling of GPUs"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8297-8525","authenticated-orcid":false,"given":"Aditya","family":"Dhakal","sequence":"first","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4727-6875","authenticated-orcid":false,"given":"Sameer G.","family":"Kulkarni","sequence":"additional","affiliation":[{"name":"IIT Gandhinagar, Gujarat, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1849-5155","authenticated-orcid":false,"given":"K. K.","family":"Ramakrishnan","sequence":"additional","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref3","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Paszke"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330351"},{"key":"ref8","article-title":"The architectural implications of distributed reinforcement learning on CPU-GPU systems","author":"Inci","year":"2020"},{"key":"ref9","first-page":"30","article-title":"A systematic methodology for analysis of deep learning hardware and software platforms","volume-title":"Proc. Mach. Learn. Syst.","author":"Wang"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MDAT.2021.3095215"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref14","year":"2024","journal-title":"NVIDIA Multi-Process Service"},{"key":"ref15","article-title":"Driving digital transformation with GPU virtualization and enterprise cloud","year":"2017"},{"key":"ref16","article-title":"Unlock next level performance with virtual GPUs","year":"2021"},{"key":"ref17","first-page":"613","article-title":"Clipper: A low-latency online prediction serving system","volume-title":"Proc. 14th USENIX Symp. Netw. Syst. Des. Implementation","author":"Crankshaw"},{"key":"ref18","first-page":"485","article-title":"Tiresias: A GPU cluster manager for distributed deep learning","volume-title":"Proc. 16th USENIX Symp. Netw. Syst. Des. Implementation","author":"Gu"},{"key":"ref19","first-page":"443","article-title":"Serving DNNs like clockwork: Performance predictability from the bottom up","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation","author":"Gujarati"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190541"},{"key":"ref21","article-title":"Host multiple models with multi-model endpoints","year":"2021"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392679"},{"key":"ref23","first-page":"595","article-title":"Gandiva: Introspective cluster scheduling for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Des. Implementation","author":"Xiao"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.73"},{"key":"ref25","first-page":"187","article-title":"G-net: Effective GPU sharing in NFV systems","volume-title":"Proc. 15th USENIX Symp. Netw. Syst. Des. Implementation","author":"Zhang"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/IWQOS52092.2021.9521266"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/2954679.2872368"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS.2018.00028"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037700"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926271"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2018.2848621"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2022.102958"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3205289.3205311"},{"key":"ref34","first-page":"947","article-title":"Analysis of large-scale multi-tenant GPU clusters for DNN training workloads","volume-title":"Proc. 2019 USENIX Annu. Tech. Conf.","author":"Jeon"},{"key":"ref35","article-title":"Towards GPU utilization prediction for cloud deep learning","volume-title":"Proc. 12th USENIX Workshop Hot Topics Cloud Comput.","author":"Yeung"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60239-0_33"},{"key":"ref37","first-page":"27","article-title":"Optimizing DNN computation with relaxed graph substitutions","volume-title":"Proc. Mach. Learn. Syst.","author":"Jia"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2017.111"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.52"},{"key":"ref40","first-page":"578","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Des. Implementation","author":"Chen"},{"key":"ref41","article-title":"The OOO VLIW JIT compiler for GPU inference","author":"Jain","year":"2019"},{"key":"ref42","article-title":"Dynamic space-time scheduling for gpu inference","author":"Jain","year":"2018"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00061"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2013.6618813"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3582080"},{"key":"ref46","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","volume-title":"Proc. 25th Int. Conf. Neural Inf. Process. Syst.","author":"Krizhevsky"},{"key":"ref47","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref49","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.4324\/9781410605337-29"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref52","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Krizhevsky"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/2940147.2940155"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/2815675.2815692"},{"key":"ref57","first-page":"951","article-title":"DeepCPU: Serving RNN-based deep learning models 10x faster","volume-title":"Proc. 2018 USENIX Annu. Tech. Conf.","author":"Zhang"},{"key":"ref58","article-title":"Deep learning performance documentation","year":"2021"},{"key":"ref59","article-title":"Google\u2019s neural machine translation system: Bridging the gap between human and machine translation","author":"Wu","year":"2016"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2549523"},{"key":"ref61","article-title":"Dissecting the nvidia volta GPU architecture via microbenchmarking","author":"Jia","year":"2018"},{"key":"ref62","article-title":"Towards QoS-aware and resource-efficient GPU microservices based on spatial multitasking GPUs in datacenters","author":"Zhang","year":"2020"},{"key":"ref63","first-page":"71","article-title":"GPU performance analysis and optimization","volume-title":"Proc. GPU Technol. Conf.","author":"Micikevicius"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/1465482.1465560"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/42411.42415"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3210240.3210319"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref71","first-page":"142","article-title":"Learning word vectors for sentiment analysis","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Maas"},{"key":"ref72","volume-title":"Data Networks","volume":"2","author":"Bertsekas","year":"1992"},{"issue":"184","key":"ref73","first-page":"184","article-title":"Completely fair scheduler","volume":"2009","author":"Pabla","year":"2009","journal-title":"Linux J."}],"container-title":["IEEE Transactions on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/6245519\/10780435\/10707184-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6245519\/10780435\/10707184.pdf?arnumber=10707184","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,11]],"date-time":"2024-12-11T02:00:35Z","timestamp":1733882435000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10707184\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10]]},"references-count":68,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tcc.2024.3476210","relation":{},"ISSN":["2168-7161","2372-0018"],"issn-type":[{"value":"2168-7161","type":"electronic"},{"value":"2372-0018","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10]]}}}