{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T16:11:52Z","timestamp":1776442312862,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-1763617"],"award-info":[{"award-number":["CNS-1763617"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-1901137"],"award-info":[{"award-number":["CNS-1901137"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2106463"],"award-info":[{"award-number":["CNS-2106463"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2312396"],"award-info":[{"award-number":["CNS-2312396"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2338512"],"award-info":[{"award-number":["CNS-2338512"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2224054"],"award-info":[{"award-number":["CNS-2224054"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["DMS-2220211"],"award-info":[{"award-number":["DMS-2220211"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3625549.3658688","type":"proceedings-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T15:55:29Z","timestamp":1725033329000},"page":"267-280","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Loki: A System for Serving ML Inference Pipelines with Hardware and Accuracy Scaling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3892-9460","authenticated-orcid":false,"given":"Sohaib","family":"Ahmad","sequence":"first","affiliation":[{"name":"University of Massachusetts Amherst, Amherst, Massachusetts, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9128-2231","authenticated-orcid":false,"given":"Hui","family":"Guan","sequence":"additional","affiliation":[{"name":"University of Massachusetts Amherst, Amherst, Massachusetts, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0558-6875","authenticated-orcid":false,"given":"Ramesh K.","family":"Sitaraman","sequence":"additional","affiliation":[{"name":"University of Massachusetts Amherst, Amherst, Massachusetts, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2018. Twitter Streaming Traces. https:\/\/archive.org\/details\/archiveteam-twitter-stream-2018-04."},{"key":"e_1_3_2_1_2_1","unstructured":"2020. Amazon SageMaker. Build train and deploy machine learning models at scale. https:\/\/aws.amazon.com\/sagemaker\/. Accessed: 2021-06-23."},{"key":"e_1_3_2_1_3_1","unstructured":"2022. Triton Inference Server. https:\/\/developer.nvidia.com\/nvidia-triton-inference-server."},{"key":"e_1_3_2_1_4_1","unstructured":"Adobe. 2024. Adobe Firefly. https:\/\/www.adobe.com\/products\/firefly.html"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624849"},{"key":"e_1_3_2_1_6_1","volume-title":"Ekya: Continuous Learning of Video Analytics Models on Edge Compute Servers. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Bhardwaj Romil","year":"2022","unstructured":"Romil Bhardwaj, Zhengxu Xia, Ganesh Ananthanarayanan, Junchen Jiang, Yuanchao Shu, Nikolaos Karianakis, Kevin Hsieh, Paramvir Bahl, and Ion Stoica. 2022. Ekya: Continuous Learning of Video Analytics Models on Edge Compute Servers. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 119--135. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/bhardwaj"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of machine learning and systems 2","author":"Blalock Davis","year":"2020","unstructured":"Davis Blalock, Jose Javier Gonzalez Ortiz, Jonathan Frankle, and John Guttag. 2020. What is the state of neural network pruning? Proceedings of machine learning and systems 2 (2020), 129--146."},{"key":"e_1_3_2_1_8_1","volume-title":"Return of the devil in the details: Delving deep into convolutional nets. arXiv preprint arXiv:1405.3531","author":"Chatfield Ken","year":"2014","unstructured":"Ken Chatfield, Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2014. Return of the devil in the details: Delving deep into convolutional nets. arXiv preprint arXiv:1405.3531 (2014)."},{"key":"e_1_3_2_1_9_1","volume-title":"Microsoft COCO Captions: Data Collection and Evaluation Server. CoRR abs\/1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. CoRR abs\/1504.00325 (2015). arXiv:1504.00325 http:\/\/arxiv.org\/abs\/1504.00325"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421285"},{"key":"e_1_3_2_1_11_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 613--627. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/crankshaw"},{"key":"e_1_3_2_1_12_1","first-page":"11","article-title":"ONNX Runtime. https:\/\/onnxruntime.ai\/","volume":"1","author":"ONNX","year":"2021","unstructured":"ONNX Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. Version: 1.11.0.","journal-title":"Version"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241559"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3135974.3135993"},{"key":"e_1_3_2_1_15_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving {DNNs} like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 443--462."},{"key":"e_1_3_2_1_16_1","volume-title":"Cocktail: A Multidimensional Optimization for Model Serving in Cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Gunasekaran Jashwant Raj","unstructured":"Jashwant Raj Gunasekaran, Cyan Subhra Mishra, Prashanth Thinakaran, Bikash Sharma, Mahmut Taylan Kandemir, and Chita R. Das. 2022. Cocktail: A Multidimensional Optimization for Model Serving in Cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 1041--1057. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/gunasekaran"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3526173"},{"key":"e_1_3_2_1_18_1","unstructured":"Gurobi Optimization LLC. 2022. Gurobi Optimizer Reference Manual. https:\/\/www.gurobi.com"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00012"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486993"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2723372.2742785"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567508"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","unstructured":"Glenn Jocher. 2020. YOLOv5 by Ultralytics. 10.5281\/zenodo.3908559","DOI":"10.5281\/zenodo.3908559"},{"key":"e_1_3_2_1_25_1","volume-title":"Markus Weimer, and Matteo Interlandi.","author":"Lee Yunseong","year":"2018","unstructured":"Yunseong Lee, Alberto Scolari, Byung-Gon Chun, Marco Domenico Santambrogio, Markus Weimer, and Matteo Interlandi. 2018. {PRETZEL}: Opening the black box of machine learning prediction serving systems. In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18). 611--626."},{"key":"e_1_3_2_1_26_1","volume-title":"2020 IEEE International Conference on Cloud Engineering (IC2E). IEEE, 66--72","author":"LeMay Matthew","year":"2020","unstructured":"Matthew LeMay, Shijian Li, and Tian Guo. 2020. Perseus: Characterizing performance and cost of multi-tenant serving for cnn models. In 2020 IEEE International Conference on Cloud Engineering (IC2E). IEEE, 66--72."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607034"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588195.3592997"},{"key":"e_1_3_2_1_29_1","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 663--679. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan"},{"key":"e_1_3_2_1_30_1","volume-title":"High-Performance ML Serving. In Workshop on ML Systems at NIPS","author":"Olston Christopher","year":"2017","unstructured":"Christopher Olston, Fangwei Li, Jeremiah Harmsen, Jordan Soyke, Kiril Gorovoy, Li Lao, Noah Fiedel, Sukriti Ramesh, and Vinu Rajashekhar. 2017. TensorFlow-Serving: Flexible, High-Performance ML Serving. In Workshop on ML Systems at NIPS 2017."},{"key":"e_1_3_2_1_31_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_32_1","volume-title":"INFaaS: A Model-less and Managed Inference Serving System. arXiv preprint arXiv:1905.13348","author":"Romero Francisco","year":"2019","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. 2019. INFaaS: A Model-less and Managed Inference Serving System. arXiv preprint arXiv:1905.13348 (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. 2021. {INFaaS}: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 397--411."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486972"},{"key":"e_1_3_2_1_35_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, \u00cd\u00f1igo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. 2020. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). 205--218."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"e_1_3_2_1_38_1","volume-title":"International conference on machine learning. PMLR, 6105--6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105--6114."},{"key":"e_1_3_2_1_39_1","volume-title":"Le","author":"Tan Mingxing","year":"2020","unstructured":"Mingxing Tan and Quoc V. Le. 2020. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. arXiv:1905.11946 [cs.LG]"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486987"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.14778\/3282495.3282499"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3232715"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.25"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_2_1_46_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. {MArk}: Exploiting Cloud Services for {Cost-Effective},{SLO-Aware} Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 1049--1062."},{"key":"e_1_3_2_1_47_1","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Zhang Haoyu","unstructured":"Haoyu Zhang, Ganesh Ananthanarayanan, Peter Bodik, Matthai Philipose, Paramvir Bahl, and Michael J. Freedman. 2017. Live Video Analytics at Scale with Approximation and Delay-Tolerance. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 377--392. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/zhang"},{"key":"e_1_3_2_1_48_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 787--808. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhang-hong"}],"event":{"name":"HPDC '24: 33rd International Symposium on High-Performance Parallel and Distributed Computing","location":"Pisa Italy","acronym":"HPDC '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658688","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3625549.3658688","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:38Z","timestamp":1750287038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658688"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":48,"alternative-id":["10.1145\/3625549.3658688","10.1145\/3625549"],"URL":"https:\/\/doi.org\/10.1145\/3625549.3658688","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}