{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:36:45Z","timestamp":1777106205323,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,17]],"date-time":"2024-04-17T00:00:00Z","timestamp":1713312000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-1763617"],"award-info":[{"award-number":["CNS-1763617"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2106299"],"award-info":[{"award-number":["CNS-2106299"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2312396"],"award-info":[{"award-number":["2312396"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2220211"],"award-info":[{"award-number":["2220211"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2224054"],"award-info":[{"award-number":["2224054"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3617232.3624849","type":"proceedings-article","created":{"date-parts":[[2024,4,17]],"date-time":"2024-04-17T20:10:56Z","timestamp":1713384656000},"page":"318-334","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":26,"title":["Proteus: A High-Throughput Inference-Serving System with Accuracy Scaling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3892-9460","authenticated-orcid":false,"given":"Sohaib","family":"Ahmad","sequence":"first","affiliation":[{"name":"University of Massachusetts, Amherst, Amherst, Massachusetts, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9128-2231","authenticated-orcid":false,"given":"Hui","family":"Guan","sequence":"additional","affiliation":[{"name":"University of Massachusetts, Amherst, Amherst, Massachusetts, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3534-8787","authenticated-orcid":false,"given":"Brian D.","family":"Friedman","sequence":"additional","affiliation":[{"name":"Nokia Bell Labs, Murray Hill, New Jersey, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9675-0928","authenticated-orcid":false,"given":"Thomas","family":"Williams","sequence":"additional","affiliation":[{"name":"Nokia Bell Labs, Murray Hill, New Jersey, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0558-6875","authenticated-orcid":false,"given":"Ramesh K.","family":"Sitaraman","sequence":"additional","affiliation":[{"name":"University of Massachusetts, Amherst, Amherst, Massachusetts, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2028-5533","authenticated-orcid":false,"given":"Thomas","family":"Woo","sequence":"additional","affiliation":[{"name":"Nokia Bell Labs, Murray Hill, New Jersey, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,4,17]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2018. Twitter Streaming Traces. https:\/\/archive.org\/details\/archiveteam-twitter-stream-2018-04."},{"key":"e_1_3_2_1_2_1","unstructured":"2020. Amazon SageMaker. Build train and deploy machine learning models at scale. https:\/\/aws.amazon.com\/sagemaker\/. Accessed: 2021-06-23."},{"key":"e_1_3_2_1_3_1","unstructured":"2022. Azure Machine Learning. https:\/\/azure.microsoft.com\/en-us\/services\/machine-learning\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2022. The ONNX Model Zoo. https:\/\/github.com\/onnx\/models. Accessed: 2022-06-06."},{"key":"e_1_3_2_1_5_1","unstructured":"2022. Triton Inference Server. https:\/\/developer.nvidia.com\/nvidia-triton-inference-server."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.)","volume":"2","author":"Blalock Davis","year":"2020","unstructured":"Davis Blalock, Jose Javier Gonzalez Ortiz, Jonathan Frankle, and John Guttag. 2020. What is the State of Neural Network Pruning?. In Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.), Vol. 2. 129--146. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2020\/file\/6c44dc73014d66ba49b28d483a8f8b0d-Paper.pdf"},{"key":"e_1_3_2_1_7_1","volume-title":"Multi-model Machine Learning Inference Serving with GPU Spatial Partitioning. CoRR abs\/2109.01611","author":"Choi Seungbeom","year":"2021","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2021. Multi-model Machine Learning Inference Serving with GPU Spatial Partitioning. CoRR abs\/2109.01611 (2021). arXiv:2109.01611 https:\/\/arxiv.org\/abs\/2109.01611"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00049"},{"key":"e_1_3_2_1_9_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 613--627. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/crankshaw"},{"issue":"8","key":"e_1_3_2_1_10_1","first-page":"1","article-title":"ONNX Runtime. https:\/\/onnxruntime.ai\/","volume":"1","author":"ONNX","year":"2021","unstructured":"ONNX Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. Version: 1.8.1.","journal-title":"Version"},{"key":"e_1_3_2_1_11_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805 (2018). arXiv:1810.04805 http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241559"},{"key":"e_1_3_2_1_14_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 443--462. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/gujarati"},{"key":"e_1_3_2_1_15_1","volume-title":"Cocktail: A Multidimensional Optimization for Model Serving in Cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Gunasekaran Jashwant Raj","unstructured":"Jashwant Raj Gunasekaran, Cyan Subhra Mishra, Prashanth Thinakaran, Bikash Sharma, Mahmut Taylan Kandemir, and Chita R. Das. 2022. Cocktail: A Multidimensional Optimization for Model Serving in Cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 1041--1057. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/gunasekaran"},{"key":"e_1_3_2_1_16_1","first-page":"1","article-title":"GluonCV and GluonNLP: deep learning in computer vision and natural language processing","volume":"21","author":"Guo Jian","year":"2020","unstructured":"Jian Guo, He He, Tong He, Leonard Lausen, Mu Li, Haibin Lin, Xingjian Shi, Chenguang Wang, Junyuan Xie, Sheng Zha, et al. 2020. GluonCV and GluonNLP: deep learning in computer vision and natural language processing. J. Mach. Learn. Res. 21, 23 (2020), 1--7.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3526173"},{"key":"e_1_3_2_1_18_1","unstructured":"Gurobi Optimization LLC. 2022. Gurobi Optimizer Reference Manual. https:\/\/www.gurobi.com"},{"key":"e_1_3_2_1_19_1","volume-title":"One Size Does Not Fit All: Quantifying and Exposing the Accuracy-Latency Trade-off in Machine Learning Cloud Service APIs via Tolerance Tiers. CoRR abs\/1906.11307","author":"Halpern Matthew","year":"2019","unstructured":"Matthew Halpern, Behzad Boroujerdian, Todd W. Mummert, Evelyn Duesterwald, and Vijay Janapa Reddi. 2019. One Size Does Not Fit All: Quantifying and Exposing the Accuracy-Latency Trade-off in Machine Learning Cloud Service APIs via Tolerance Tiers. CoRR abs\/1906.11307 (2019). arXiv:1906.11307 http:\/\/arxiv.org\/abs\/1906.11307"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. CoRR abs\/1704.04861","author":"Howard Andrew G.","year":"2017","unstructured":"Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. 2017. MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. CoRR abs\/1704.04861 (2017). arXiv:1704.04861 http:\/\/arxiv.org\/abs\/1704.04861"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 4700--4708","author":"Huang Gao","unstructured":"Gao Huang, Zhuang Liu, Laurens van der Maaten, and Kilian Q. Weinberger. 2017. Densely Connected Convolutional Networks. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 4700--4708."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2723372.2742785"},{"key":"e_1_3_2_1_24_1","volume-title":"Accelerating Multi-Model Inference by Merging DNNs of Different Weights. arXiv preprint arXiv:2009.13062","author":"Jeong Joo Seong","year":"2020","unstructured":"Joo Seong Jeong, Soojeong Kim, Gyeong-In Yu, Yunseong Lee, and Byung-Gon Chun. 2020. Accelerating Multi-Model Inference by Merging DNNs of Different Weights. arXiv preprint arXiv:2009.13062 (2020)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"Glenn Jocher Ayush Chaurasia Alex Stoken Jirka Borovec NanoCode012 Yonghye Kwon Kalen Michael TaoXie Jiacong Fang imyhxy Lorna Zeng Yifu Colin Wong Abhiram V Diego Montes Zhiqiang Wang Cristi Fati Jebastin Nadar Laughing UnglvKitDe Victor Sonck tkianai yxNONG Piotr Skalski Adam Hogan Dhruv Nair Max Strobel and Mrinal Jain. 2022. ultralytics\/yolov5: v7.0 - YOLOv5 SOTA Realtime Instance Segmentation. 10.5281\/zenodo.7347926","DOI":"10.5281\/zenodo.7347926"},{"key":"e_1_3_2_1_26_1","volume-title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. CoRR abs\/1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. CoRR abs\/1909.11942 (2019). arXiv:1909.11942 http:\/\/arxiv.org\/abs\/1909.11942"},{"key":"e_1_3_2_1_27_1","volume-title":"PRETZEL: Opening the Black Box of Machine Learning Prediction Serving Systems. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Lee Yunseong","year":"2018","unstructured":"Yunseong Lee, Alberto Scolari, Byung-Gon Chun, Marco Domenico Santambrogio, Markus Weimer, and Matteo Interlandi. 2018. PRETZEL: Opening the Black Box of Machine Learning Prediction Serving Systems. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 611--626. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/lee"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E48712.2020.00014"},{"key":"e_1_3_2_1_29_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR abs\/1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR abs\/1907.11692 (2019). arXiv:1907.11692 http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507752"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/2600239.2600241"},{"key":"e_1_3_2_1_32_1","volume-title":"High-Performance ML Serving. In Workshop on ML Systems at NIPS","author":"Olston Christopher","year":"2017","unstructured":"Christopher Olston, Fangwei Li, Jeremiah Harmsen, Jordan Soyke, Kiril Gorovoy, Li Lao, Noah Fiedel, Sukriti Ramesh, and Vinu Rajashekhar. 2017. TensorFlow-Serving: Flexible, High-Performance ML Serving. In Workshop on ML Systems at NIPS 2017."},{"key":"e_1_3_2_1_33_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_34_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research 21, 140 (2020), 1--67. http:\/\/jmlr.org\/papers\/v21\/20-074.html","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_35_1","volume-title":"INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397--411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_37_1","volume-title":"2019 USENIX Conference on Operational Machine Learning (OpML 19)","author":"Soifer Jonathan","year":"2019","unstructured":"Jonathan Soifer, Jason Li, Mingqin Li, Jeffrey Zhu, Yingnan Li, Yuxiong He, Elton Zheng, Adi Oltean, Maya Mosyak, Chris Barnes, Thomas Liu, and Junhua Wang. 2019. Deep Learning Inference Service at Microsoft. In 2019 USENIX Conference on Operational Machine Learning (OpML 19). USENIX Association, Santa Clara, CA, 15--17. https:\/\/www.usenix.org\/conference\/opml19\/presentation\/soifer"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD53861.2021.00016"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"e_1_3_2_1_41_1","volume-title":"Well-Read Students Learn Better: The Impact of Student Initialization on Knowledge Distillation. CoRR abs\/1908.08962","author":"Turc Iulia","year":"2019","unstructured":"Iulia Turc, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Well-Read Students Learn Better: The Impact of Student Initialization on Knowledge Distillation. CoRR abs\/1908.08962 (2019). arXiv:1908.08962 http:\/\/arxiv.org\/abs\/1908.08962"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446763"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38--45","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38--45. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3317550.3321443"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582029"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00309"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3617232.3624849","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3617232.3624849","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3617232.3624849","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:13Z","timestamp":1750178773000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3617232.3624849"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,17]]},"references-count":47,"alternative-id":["10.1145\/3617232.3624849","10.1145\/3617232"],"URL":"https:\/\/doi.org\/10.1145\/3617232.3624849","relation":{},"subject":[],"published":{"date-parts":[[2024,4,17]]},"assertion":[{"value":"2024-04-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}