{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T18:57:48Z","timestamp":1764961068522,"version":"3.46.0"},"reference-count":52,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000923","name":"Australian Research Council","doi-asserted-by":"publisher","award":["LP220200893"],"award-info":[{"award-number":["LP220200893"]}],"id":[{"id":"10.13039\/501100000923","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1109\/tpds.2025.3627574","type":"journal-article","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T17:13:36Z","timestamp":1761930816000},"page":"287-303","source":"Crossref","is-referenced-by-count":0,"title":["Joint Optimization of Resource Allocation and Request Batching for Multi-Tenant Inference Serving on GPU"],"prefix":"10.1109","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0306-1859","authenticated-orcid":false,"given":"Yuning","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Electrical and Computer Engineering, University of Sydney, Darlington, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5257-9227","authenticated-orcid":false,"given":"Nan","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Electrical and Computer Engineering, University of Sydney, Darlington, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7891-4474","authenticated-orcid":false,"given":"Chen","family":"Pan","sequence":"additional","affiliation":[{"name":"School of Electrical and Computer Engineering, University of Sydney, Darlington, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1130-0888","authenticated-orcid":false,"given":"Dong","family":"Yuan","sequence":"additional","affiliation":[{"name":"School of Electrical and Computer Engineering, University of Sydney, Darlington, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1162\/neco_a_00990"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5431"},{"article-title":"LLaMA: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref3"},{"article-title":"Tensorflow-serving: Flexible, high-performance ML serving","year":"2017","author":"Olston","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605638"},{"article-title":"CUDA multi process service overview","year":"2021","author":"Corporation","key":"ref6"},{"article-title":"Triton inference server","year":"2023","author":"Corporation","key":"ref7"},{"key":"ref8","first-page":"613","article-title":"Clipper: A low-latency online prediction serving system","volume-title":"Proc. 14th USENIX Symp. Netw. Syst. Des. Implementation","author":"Crankshaw","year":"2017"},{"year":"2021","key":"ref9","article-title":"Nvidia multi-instance GPU"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1423"},{"key":"ref12","article-title":"An image is worth 16 x 16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref13","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Amodei","year":"2016"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref16","first-page":"199","article-title":"Serving heterogeneous machine learning models on Multi-GPU servers with spatio-temporal sharing","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Choi","year":"2022"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"ref18","article-title":"Continuous control with deep reinforcement learning","volume-title":"Proc. 4th Int. Conf. Learn. Representations","author":"Lillicrap","year":"2016"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.501"},{"key":"ref20","first-page":"135","article-title":"ServerlessLLM: Low-latency serverless inference for large language models","volume-title":"Proc. 18th USENIX Symp. Operating Syst. Des. Implementation","author":"Fu","year":"2024"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629565"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD51958.2021.9643501"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/NEUREL.2018.8587006"},{"key":"ref24","article-title":"A survey of multi-tenant deep learning inference on GPU","volume-title":"Proc. Workshop Cloud Intell. \/ AIOps","author":"Yu","year":"2022"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00042"},{"year":"2024","key":"ref27","article-title":"A tensorrt toolbox for optimized large language model inference"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"ref29","first-page":"539","article-title":"Microsecond-scale preemption for concurrent GPU-accelerated DNN inferences","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Han","year":"2022"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629578"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613163"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TSUSC.2025.3528105"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TETC.2024.3403874"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2023.3249153"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458837"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1613\/jair.301"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2743240"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2019.2933417"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2022.3189725"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2020.2988367"},{"key":"ref42","article-title":"Playing atari with deep reinforcement learning","volume-title":"Proc. NIPS Deep Learn. Workshop","author":"Mnih","year":"2013"},{"key":"ref43","first-page":"387","article-title":"Deterministic policy gradient algorithms","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Silver","year":"2014"},{"key":"ref44","first-page":"1008","article-title":"Actor-critic algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Konda","year":"1999"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605585"},{"key":"ref46","first-page":"1","article-title":"Apollo: Automatic partition-based operator fusion through layer by layer optimization","volume-title":"Proc. Mach. Learn. Syst.","author":"Zhao","year":"2022"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2663346"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455008"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"article-title":"CIFAR-10 (canadian institute for advanced research)","year":"2009","author":"Krizhevsky","key":"ref50"},{"article-title":"The LJ speech dataset","year":"2017","author":"Ito","key":"ref51"},{"key":"ref52","first-page":"485","article-title":"Tiresias: A GPU cluster manager for distributed deep learning","volume-title":"Proc. 16th USENIX Symp. Netw. Syst. Des. Implementation","author":"Gu","year":"2019"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/71\/11261373\/11223102.pdf?arnumber=11223102","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T18:40:34Z","timestamp":1764960034000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11223102\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":52,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2025.3627574","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"type":"print","value":"1045-9219"},{"type":"electronic","value":"1558-2183"},{"type":"electronic","value":"2161-9883"}],"subject":[],"published":{"date-parts":[[2026,1]]}}}