{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,7]],"date-time":"2026-07-07T04:45:37Z","timestamp":1783399537262,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3694715.3695963","type":"proceedings-article","created":{"date-parts":[[2024,11,15]],"date-time":"2024-11-15T19:28:18Z","timestamp":1731698898000},"page":"607-623","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Apparate: Rethinking Early Exits to Tame Latency-Throughput Tensions in ML Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9291-2060","authenticated-orcid":false,"given":"Yinwei","family":"Dai","sequence":"first","affiliation":[{"name":"Princeton University, Princeton, New Jersey, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6973-3259","authenticated-orcid":false,"given":"Rui","family":"Pan","sequence":"additional","affiliation":[{"name":"Princeton University, Princeton, New Jersey, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5952-3346","authenticated-orcid":false,"given":"Anand","family":"Iyer","sequence":"additional","affiliation":[{"name":"Georgia Tech, Atlanta, Georgia, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2095-7024","authenticated-orcid":false,"given":"Kai","family":"Li","sequence":"additional","affiliation":[{"name":"Princeton University, Princeton, New Jersey, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7002-5033","authenticated-orcid":false,"given":"Ravi","family":"Netravali","sequence":"additional","affiliation":[{"name":"Princeton University, Princeton, New Jersey, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,11,15]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2013. Web data: Amazon reviews. https:\/\/snap.stanford.edu\/data\/web-Amazon.html."},{"key":"e_1_3_2_1_2_1","unstructured":"2018. Neural Network Exchange Format (NNEF). https:\/\/www.khronos.org\/nnef\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. Apache TVM: An End to End Machine Learning Compiler Framework for CPUs GPUs and accelerators. https:\/\/tvm.apache.org\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2023. How Microsoft's bet on Azure unlocked an AI revolution. https:\/\/news.microsoft.com\/source\/features\/ai\/how-microsofts-bet-on-azure-unlocked-an-ai-revolution\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2023. NVIDIA TensorRT: Programmable Inference Accelerator. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. HuggingFace Pipelines. https:\/\/huggingface.co\/docs\/transformers\/en\/main_classes\/pipelines."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. NVIDIA Triton Inference Server. https:\/\/developer.nvidia.com\/nvidia-triton-inference-server."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. ONNX Run Time. https:\/\/github.com\/microsoft\/onnxruntime."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. Open Neural Network Exchange (ONNX). https:\/\/onnx.ai\/."},{"key":"e_1_3_2_1_10_1","unstructured":"2024. The Yelp Reviews Dataset. https:\/\/www.yelp.com\/dataset."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. TorchServe. https:\/\/pytorch.org\/serve\/."},{"key":"e_1_3_2_1_12_1","volume-title":"Boggart: Towards GeneralPurpose Acceleration of Retrospective Video Analytics. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Agarwal Neil","year":"2023","unstructured":"Neil Agarwal and Ravi Netravali. 2023. Boggart: Towards GeneralPurpose Acceleration of Retrospective Video Analytics. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 933--951. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/agarwal-neil"},{"key":"e_1_3_2_1_13_1","volume-title":"Preech: A System for Privacy-Preserving Speech Transcription. In 29th USENIX Security Symposium (USENIX Security 20)","author":"Ahmed Shimaa","year":"2020","unstructured":"Shimaa Ahmed, Amrita Roy Chowdhury, Kassem Fawaz, and Parmesh Ramanathan. 2020. Preech: A System for Privacy-Preserving Speech Transcription. In 29th USENIX Security Symposium (USENIX Security 20). USENIX Association, 2703--2720. https:\/\/www.usenix.org\/conference\/usenixsecurity20\/presentation\/ahmed-shimaa"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.362"},{"key":"e_1_3_2_1_15_1","volume-title":"Ekya: Continuous Learning of Video Analytics Models on Edge Compute Servers. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Bhardwaj Romil","year":"2022","unstructured":"Romil Bhardwaj, Zhengxu Xia, Ganesh Ananthanarayanan, Junchen Jiang, Yuanchao Shu, Nikolaos Karianakis, Kevin Hsieh, Paramvir Bahl, and Ion Stoica. 2022. Ekya: Continuous Learning of Video Analytics Models on Edge Compute Servers. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 119--135. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/bhardwaj"},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR, 527--536","author":"Bolukbasi Tolga","year":"2017","unstructured":"Tolga Bolukbasi, Joseph Wang, Ofer Dekel, and Venkatesh Saligrama. 2017. Adaptive neural networks for efficient inference. In International Conference on Machine Learning. PMLR, 527--536."},{"key":"e_1_3_2_1_17_1","first-page":"406","article-title":"Scaling video analytics on constrained edge nodes","volume":"1","author":"Canel Christopher","year":"2019","unstructured":"Christopher Canel, Thomas Kim, Giulio Zhou, Conglong Li, Hyeontaek Lim, David G Andersen, Michael Kaminsky, and Subramanya Dulloor. 2019. Scaling video analytics on constrained edge nodes. Proceedings of Machine Learning and Systems 1 (2019), 406--417.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00049"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421285"},{"key":"e_1_3_2_1_20_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 613--627. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/crankshaw"},{"key":"e_1_3_2_1_21_1","volume-title":"Optimizing Dynamic Neural Networks with Brainstorm. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Cui Weihao","year":"2023","unstructured":"Weihao Cui, Zhenhua Han, Lingji Ouyang, Yichuan Wang, Ningxin Zheng, Lingxiao Ma, Yuqing Yang, Fan Yang, Jilong Xue, Lili Qiu, Lidong Zhou, Quan Chen, Haisheng Tan, and Minyi Guo. 2023. Optimizing Dynamic Neural Networks with Brainstorm. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 797--815. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/cui"},{"key":"e_1_3_2_1_22_1","unstructured":"Harm de Vries Florian Strub J\u00e9r\u00e9mie Mary Hugo Larochelle Olivier Pietquin and Aaron Courville. 2017. Modulating early visual processing by language. arXiv:1707.00683 [cs.CV]"},{"key":"e_1_3_2_1_23_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv e-prints, Article arXiv:1810.04805 (Oct.","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv e-prints, Article arXiv:1810.04805 (Oct. 2018), arXiv:1810.04805 pages. arXiv:1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Mostafa Elhoushi Akshat Shrivastava Diana Liskovich Basil Hosmer Bram Wasti Liangzhen Lai Anas Mahmoud Bilge Acun Saurabh Agarwal Ahmed Roman et al. 2024. Layer skip: Enabling early exit inference and self-speculative decoding. arXiv preprint arXiv:2404.16710 (2024).","DOI":"10.18653\/v1\/2024.acl-long.681"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/3586589.3586709"},{"key":"e_1_3_2_1_26_1","unstructured":"Elias Frantar and Dan Alistarh. 2023. SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot. arXiv:2301.00774 [cs.LG]"},{"key":"e_1_3_2_1_27_1","volume-title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arXiv:2210.17323 [cs.LG]","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arXiv:2210.17323 [cs.LG]"},{"key":"e_1_3_2_1_28_1","unstructured":"Gigaspaces. 2023. Amazon Found Every 100ms of Latency Cost them 1% in Sales. https:\/\/www.gigaspaces.com\/blog\/amazon-found-every-100ms-of-latency-cost-them-1-in-sales."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3135974.3135993"},{"key":"e_1_3_2_1_30_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 443--462. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/gujarati"},{"key":"e_1_3_2_1_31_1","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Guo Peizhen","year":"2021","unstructured":"Peizhen Guo, Bo Hu, and Wenjun Hu. 2021. Mistify: Automating DNN Model Porting for On-Device Inference at the Edge. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). USENIX Association, 705--719. https:\/\/www.usenix.org\/conference\/nsdi21\/presentation\/guo"},{"key":"e_1_3_2_1_32_1","volume-title":"2018 IEEE International Symposium on High Performance Computer Architecture (HPCA). 620--629","author":"Hazelwood Kim","year":"2018","unstructured":"Kim Hazelwood, Sarah Bird, David Brooks, Soumith Chintala, Utku Diril, Dmytro Dzhulgakov, Mohamed Fawzy, Bill Jia, Yangqing Jia, Aditya Kalro, James Law, Kevin Lee, Jason Lu, Pieter Noordhuis, Misha Smelyanskiy, Liang Xiong, and Xiaodong Wang. 2018. Applied Machine Learning at Facebook: A Datacenter Infrastructure Perspective. In 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA). 620--629."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation","author":"Hsieh Kevin","year":"2018","unstructured":"Kevin Hsieh, Ganesh Ananthanarayanan, Peter Bodik, Shivaram Venkataraman, Paramvir Bahl, Matthai Philipose, Phillip B. Gibbons, and Onur Mutlu. 2018. Focus: Querying Large Video Datasets with Low Latency and Low Cost. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (Carlsbad, CA, USA) (OSDI'18). USENIX Association, USA, 269--286."},{"key":"e_1_3_2_1_35_1","volume-title":"Weinberger","author":"Huang Gao","year":"2018","unstructured":"Gao Huang, Danlu Chen, Tianhong Li, Felix Wu, Laurens van der Maaten, and Kilian Q. Weinberger. 2018. Multi-Scale Dense Networks for Resource Efficient Image Classification. arXiv:1703.09844 [cs.LG]"},{"key":"e_1_3_2_1_36_1","volume-title":"Weinberger","author":"Huang Gao","year":"2018","unstructured":"Gao Huang, Danlu Chen, Tianhong Li, Felix Wu, Laurens van der Maaten, and Kilian Q. Weinberger. 2018. Multi-Scale Dense Networks for Resource Efficient Image Classification. arXiv:1703.09844 [cs.LG]"},{"key":"e_1_3_2_1_37_1","unstructured":"Yukun Huang Yanda Chen Zhou Yu and Kathleen McKeown. 2022. In-context Learning Distillation: Transferring Few-shot Learning Ability of Pre-trained Language Models. arXiv:2212.10670 [cs.CL]"},{"key":"e_1_3_2_1_38_1","unstructured":"HuggingFace. 2023. Pretrained Models. https:\/\/huggingface.co\/transformers\/v3.3.1\/pretrained_models.html."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695978"},{"key":"e_1_3_2_1_40_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 947--960. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/jeon"},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR, 3301--3310","author":"Kaya Yigitcan","year":"2019","unstructured":"Yigitcan Kaya, Sanghyun Hong, and Tudor Dumitras. 2019. Shallow-deep networks: Understanding and mitigating network overthinking. In International conference on machine learning. PMLR, 3301--3310."},{"key":"e_1_3_2_1_42_1","volume-title":"RECL: Responsive Resource-Efficient Continuous Learning for Video Analytics. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Khani Mehrdad","year":"2023","unstructured":"Mehrdad Khani, Ganesh Ananthanarayanan, Kevin Hsieh, Junchen Jiang, Ravi Netravali, Yuanchao Shu, Mohammad Alizadeh, and Victor Bahl. 2023. RECL: Responsive Resource-Efficient Continuous Learning for Video Analytics. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 917--932. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/khani"},{"key":"e_1_3_2_1_43_1","unstructured":"Sehoon Kim Coleman Hooper Amir Gholami Zhen Dong Xiuyu Li Sheng Shen Michael W. Mahoney and Kurt Keutzer. 2024. SqueezeLLM: Dense-and-Sparse Quantization. arXiv:2306.07629 [cs.CL]"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_45_1","unstructured":"Yaniv Leviathan Matan Kalman and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. arXiv:2211.17192 [cs.LG]"},{"key":"e_1_3_2_1_46_1","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. {AlpaServe}: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663--679."},{"key":"e_1_3_2_1_47_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv:2306.00978 [cs.CL]","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, Chuang Gan, and Song Han. 2023. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv:2306.00978 [cs.CL]"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.537"},{"key":"e_1_3_2_1_49_1","unstructured":"Xinyin Ma Gongfan Fang and Xinchao Wang. 2023. LLM-Pruner: On the Structural Pruning of Large Language Models. arXiv:2305.11627 [cs.CL]"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2974843"},{"key":"e_1_3_2_1_51_1","unstructured":"Christopher Olston Noah Fiedel Kiril Gorovoy Jeremiah Harmsen Li Lao Fangwei Li Vinu Rajashekhar Sukriti Ramesh and Jordan Soyke. 2017. TensorFlow-Serving: Flexible High-Performance ML Serving. arXiv:1712.06139 [cs.DC]"},{"key":"e_1_3_2_1_52_1","volume-title":"Guoqing Harry Xu, and Ravi Netravali","author":"Padmanabhan Arthi","year":"2022","unstructured":"Arthi Padmanabhan, Neil Agarwal, Anand P. Iyer, Ganesh Ananthanarayanan, Yuanchao Shu, Nikolaos Karianakis, Guoqing Harry Xu, and Ravi Netravali. 2022. GEMEL: Model Merging for Memory-Efficient, Real-Time Video Analytics at the Edge. CoRR abs\/2201.07705 (2022). arXiv:2201.07705 https:\/\/arxiv.org\/abs\/2201.07705"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","unstructured":"Aditya Pal Abhilash Barigidad and Abhijit Mustafi. 2020. IMDb Movie Reviews Dataset. 10.21227\/zm1y-b270","DOI":"10.21227\/zm1y-b270"},{"key":"e_1_3_2_1_54_1","unstructured":"Jon Porter. 2023. ChatGPT continues to be one of the fastest-growing services ever. https:\/\/www.theverge.com\/2023\/11\/6\/23948386\/chatgpt-active-user-count-openai-developer-conference."},{"key":"e_1_3_2_1_55_1","unstructured":"PyTorch. 2023. Model Zoo. https:\/\/pytorch.org\/serve\/model_zoo.html."},{"key":"e_1_3_2_1_56_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_57_1","volume-title":"100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250","author":"Rajpurkar Pranav","year":"2016","unstructured":"Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. Squad: 100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)."},{"key":"e_1_3_2_1_58_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. 2021. {INFaaS}: Automated model-less inference serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 397--411."},{"key":"e_1_3_2_1_59_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_60_1","first-page":"17456","article-title":"Confident adaptive language modeling","volume":"35","author":"Schuster Tal","year":"2022","unstructured":"Tal Schuster, Adam Fisch, Jai Gupta, Mostafa Dehghani, Dara Bahri, Vinh Tran, Yi Tay, and Donald Metzler. 2022. Confident adaptive language modeling. Advances in Neural Information Processing Systems 35 (2022), 17456--17472.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.593"},{"key":"e_1_3_2_1_62_1","volume-title":"Get to the point: Summarization with pointer-generator networks. arXiv preprint arXiv:1704.04368","author":"Liu Peter J","year":"2017","unstructured":"Abigail See, Peter J Liu, and Christopher D Manning. 2017. Get to the point: Summarization with pointer-generator networks. arXiv preprint arXiv:1704.04368 (2017)."},{"key":"e_1_3_2_1_63_1","volume-title":"Hill-climbing search. Encyclopedia of cognitive science 81","author":"Selman Bart","year":"2006","unstructured":"Bart Selman and Carla P Gomes. 2006. Hill-climbing search. Encyclopedia of cognitive science 81 (2006), 82."},{"key":"e_1_3_2_1_64_1","unstructured":"Jaime Sevilla Pablo Villalobos and Juan Cer\u00f3n. 2021. Parameter counts in Machine Learning. https:\/\/www.lesswrong.com\/posts\/GzoWcYibWYwJva8aL\/parameter-counts-in-machine-learning."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_66_1","volume-title":"Ekko: A Large-Scale Deep Learning Recommender System with Low-Latency Model Update. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Sima Chijun","year":"2022","unstructured":"Chijun Sima, Yao Fu, Man-Kit Sit, Liyi Guo, Xuri Gong, Feng Lin, Junyu Wu, Yongsheng Li, Haidong Rong, Pierre-Louis Aublin, and Luo Mai. 2022. Ekko: A Large-Scale Deep Learning Recommender System with Low-Latency Model Update. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 821--839. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/sima"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407837"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/MWC.2019.1800553"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","unstructured":"Surat Teerapittayanon Bradley McDanel and H. T. Kung. 2017. BranchyNet: Fast Inference via Early Exiting from Deep Neural Networks. arXiv:1709.01686 [cs.NE]","DOI":"10.1109\/ICPR.2016.7900006"},{"key":"e_1_3_2_1_70_1","unstructured":"Think with Google. 2017. The Need for Mobile Speed: How Mobile Latency Impacts Publisher Revenue. https:\/\/www.thinkwithgoogle.com\/marketing-strategies\/app-and-mobile\/mobile-speed-latency-impacts-publisher-revenue\/."},{"key":"e_1_3_2_1_71_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_72_1","volume-title":"Gonzalez","author":"Wang Xin","year":"2018","unstructured":"Xin Wang, Fisher Yu, Zi-Yi Dou, Trevor Darrell, and Joseph E. Gonzalez. 2018. SkipNet: Learning Dynamic Routing in Convolutional Networks. arXiv:1711.09485 [cs.CV]"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00048"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414572"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Ji Xin Raphael Tang Jaejun Lee Yaoliang Yu and Jimmy Lin. 2020. DeeBERT: Dynamic Early Exiting for Accelerating BERT Inference. arXiv:2004.12993 [cs.CL]","DOI":"10.18653\/v1\/2020.acl-main.204"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.8"},{"key":"e_1_3_2_1_78_1","unstructured":"Yu-Syuan Xu Tsu-Jui Fu Hsuan-Kung Yang and Chun-Yi Lee. 2018. Dynamic Video Segmentation Network. arXiv:1804.00931 [cs.CV]"},{"key":"e_1_3_2_1_79_1","volume-title":"Wei Li, Yanzhang (Ryan) He, Tara N Sainath, and Trevor Deatrick Strohman.","author":"Chang Shuo","year":"2020","unstructured":"Shuo yiin Chang, Bo Li, David Johannes Rybach, Wei Li, Yanzhang (Ryan) He, Tara N Sainath, and Trevor Deatrick Strohman. 2020. Low Latency Speech Recognition using End-to-End Prefetching. In Interspeech 2020."},{"key":"e_1_3_2_1_80_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521--538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_81_1","volume-title":"Cocktailer: Analyzing and Optimizing Dynamic Control Flow in Deep Learning. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Zhang Chen","year":"2023","unstructured":"Chen Zhang, Lingxiao Ma, Jilong Xue, Yining Shi, Ziming Miao, Fan Yang, Jidong Zhai, Zhi Yang, and Mao Yang. 2023. Cocktailer: Analyzing and Optimizing Dynamic Control Flow in Deep Learning. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 681--699."},{"key":"e_1_3_2_1_82_1","volume-title":"SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. MArk: Exploiting Cloud Services for Cost-Effective, SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 1049--1062. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/zhang-chengliang"},{"key":"e_1_3_2_1_83_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 787--808. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhang-hong"},{"key":"e_1_3_2_1_84_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Zhou Wangchunshu","year":"2020","unstructured":"Wangchunshu Zhou, Canwen Xu, Tao Ge, Julian McAuley, Ke Xu, and Furu Wei. 2020. BERT Loses Patience: Fast and Robust Inference with Early Exit. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 18330--18341. https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/d4dd111a4fd973394238aca5c05bebe3-Paper.pdf"}],"event":{"name":"SOSP '24: ACM SIGOPS 30th Symposium on Operating Systems Principles","location":"Austin TX USA","acronym":"SOSP '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695963","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694715.3695963","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:48Z","timestamp":1750291548000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695963"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":84,"alternative-id":["10.1145\/3694715.3695963","10.1145\/3694715"],"URL":"https:\/\/doi.org\/10.1145\/3694715.3695963","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}