{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T14:34:08Z","timestamp":1774449248816,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3694715.3695978","type":"proceedings-article","created":{"date-parts":[[2024,11,15]],"date-time":"2024-11-15T19:28:18Z","timestamp":1731698898000},"page":"624-639","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Improving DNN Inference Throughput Using Practical, Per-Input Compute Adaptation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5952-3346","authenticated-orcid":false,"given":"Anand","family":"Padmanabha Iyer","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, US"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6294-7978","authenticated-orcid":false,"given":"Mingyu","family":"Guan","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, US"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9291-2060","authenticated-orcid":false,"given":"Yinwei","family":"Dai","sequence":"additional","affiliation":[{"name":"Princeton University, New Jersey, US"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6973-3259","authenticated-orcid":false,"given":"Rui","family":"Pan","sequence":"additional","affiliation":[{"name":"Princeton University, New Jersey, US"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3689-9591","authenticated-orcid":false,"given":"Swapnil","family":"Gandhi","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7002-5033","authenticated-orcid":false,"given":"Ravi","family":"Netravali","sequence":"additional","affiliation":[{"name":"Princeton University, New Jersey, US"}]}],"member":"320","published-online":{"date-parts":[[2024,11,15]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. ArchiveTeam JSON Download of Twitter Stream 2018-04. https:\/\/archive.org\/details\/archiveteam-twitter-stream-2018-04\/."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. GLUE Benchmark. https:\/\/gluebenchmark.com\/."},{"key":"e_1_3_2_1_3_1","unstructured":"[n. d.]. Llama Model Family. https:\/\/www.llama.com\/."},{"key":"e_1_3_2_1_4_1","unstructured":"[n. d.]. NVIDIA Triton Inference Server. https:\/\/developer.nvidia.com\/nvidia-triton-inference-server."},{"key":"e_1_3_2_1_5_1","unstructured":"[n. d.]. ONNX Run Time. https:\/\/github.com\/microsoft\/onnxruntime."},{"key":"e_1_3_2_1_6_1","unstructured":"[n. d.]. ONNX Runtime serves over 1 trillion daily inferences at Microsoft. https:\/\/news.microsoft.com\/source\/features\/ai\/how-microsofts-bet-on-azure-unlocked-an-ai-revolution\/."},{"key":"e_1_3_2_1_7_1","unstructured":"[n. d.]. Open Neural Network Exchange (ONNX). https:\/\/onnx.ai\/."},{"key":"e_1_3_2_1_8_1","unstructured":"[n. d.]. PyTorch. https:\/\/pytorch.org\/."},{"key":"e_1_3_2_1_9_1","unstructured":"[n. d.]. TensorFlow. https:\/\/www.tensorflow.org\/."},{"key":"e_1_3_2_1_10_1","unstructured":"[n. d.]. TorchServe. https:\/\/pytorch.org\/serve\/."},{"key":"e_1_3_2_1_11_1","unstructured":"[n. d.]. Transformers. https:\/\/github.com\/huggingface\/transformers."},{"key":"e_1_3_2_1_12_1","unstructured":"2021. Live Video Analytics with Microsoft Rocket for reducing edge compute costs. https:\/\/techcommunity.microsoft.com\/t5\/internet-of-things\/live-video-analytics-with-microsoft-rocket-for-reducing-edge\/ba-p\/1522305"},{"key":"e_1_3_2_1_13_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 117--134. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.362"},{"key":"e_1_3_2_1_15_1","volume-title":"Accelerating deep learning inference via learned caches. arXiv preprint arXiv:2101.07344","author":"Balasubramanian Arjun","year":"2021","unstructured":"Arjun Balasubramanian, Adarsh Kumar, Yuhan Liu, Han Cao, Shivaram Venkataraman, and Aditya Akella. 2021. Accelerating deep learning inference via learned caches. arXiv preprint arXiv:2101.07344 (2021)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-3001"},{"key":"e_1_3_2_1_17_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"Yu Cheng Duo Wang Pan Zhou and Tao Zhang. 2017. A Survey of Model Compression and Acceleration for Deep Neural Networks. 10.48550\/ARXIV.1710.09282","DOI":"10.48550\/ARXIV.1710.09282"},{"key":"e_1_3_2_1_19_1","unstructured":"Christopher Clark Kenton Lee Ming-Wei Chang Tom Kwiatkowski Michael Collins and Kristina Toutanova. 2019. BoolQ: Exploring the Surprising Difficulty of Natural Yes\/No Questions. In NAACL."},{"key":"e_1_3_2_1_20_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 613--627. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/crankshaw"},{"key":"e_1_3_2_1_21_1","volume-title":"Optimizing Dynamic Neural Networks with Brainstorm. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Cui Weihao","year":"2023","unstructured":"Weihao Cui, Zhenhua Han, Lingji Ouyang, Yichuan Wang, Ningxin Zheng, Lingxiao Ma, Yuqing Yang, Fan Yang, Jilong Xue, Lili Qiu, Lidong Zhou, Quan Chen, Haisheng Tan, and Minyi Guo. 2023. Optimizing Dynamic Neural Networks with Brainstorm. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 797--815. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/cui"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_23_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv e-prints, Article arXiv:1810.04805 (Oct.","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv e-prints, Article arXiv:1810.04805 (Oct. 2018), arXiv:1810.04805 pages. arXiv:1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_24_1","volume-title":"Reducing Transformer Depth on Demand with Structured Dropout. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SylO2yStDr","author":"Fan Angela","year":"2020","unstructured":"Angela Fan, Edouard Grave, and Armand Joulin. 2020. Reducing Transformer Depth on Demand with Structured Dropout. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SylO2yStDr"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/3586589.3586709"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d19-5409"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.repl4nlp-1.18"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"e_1_3_2_1_29_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 443--462. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/gujarati"},{"key":"e_1_3_2_1_30_1","volume-title":"Prashanth Thinakaran, Mahmut Taylan Kandemir, and Chita R. Das.","author":"Gunasekaran Jashwant Raj","year":"2021","unstructured":"Jashwant Raj Gunasekaran, Cyan Subhra Mishra, Prashanth Thinakaran, Mahmut Taylan Kandemir, and Chita R. Das. 2021. Cocktail: Leveraging Ensemble Learning for Optimized Model Serving in Public Cloud. arXiv e-prints, Article arXiv:2106.05345 (June 2021), arXiv:2106.05345 pages. arXiv:2106.05345 [cs.DC]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3117837"},{"key":"e_1_3_2_1_32_1","volume-title":"European Conference on Computer Vision. Springer, 362--378","author":"Han Yizeng","year":"2022","unstructured":"Yizeng Han, Yifan Pu, Zihang Lai, Chaofei Wang, Shiji Song, Junfeng Cao, Wenhui Huang, Chao Deng, and Gao Huang. 2022. Learning to Weight Samples for Dynamic Early-Exiting Networks. In European Conference on Computer Vision. Springer, 362--378."},{"key":"e_1_3_2_1_33_1","volume-title":"Sangeetha Abdu Jyothi, and Roy Campbell","author":"Hashemi Sayed Hadi","year":"2019","unstructured":"Sayed Hadi Hashemi, Sangeetha Abdu Jyothi, and Roy Campbell. 2019. TicTac: Accelerating Distributed Deep Learning with Communication Scheduling. In Proceedings of Machine Learning and Systems, A. Talwalkar, V. Smith, and M. Zaharia (Eds.), Vol. 1. 418--430. https:\/\/proceedings.mlsys.org\/paper\/2019\/file\/84d9ee44e457ddef7f2c4f25dc8fa865-Paper.pdf"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_35_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR."},{"key":"e_1_3_2_1_36_1","volume-title":"Article arXiv:1503.02531 (March","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv e-prints, Article arXiv:1503.02531 (March 2015), arXiv:1503.02531 pages. arXiv:1503.02531 [stat.ML]"},{"key":"e_1_3_2_1_37_1","volume-title":"The use of ARIMA models for reliability forecasting and analysis. Computers & industrial engineering 35, 1--2","author":"Ho Siu Lau","year":"1998","unstructured":"Siu Lau Ho and Min Xie. 1998. The use of ARIMA models for reliability forecasting and analysis. Computers & industrial engineering 35, 1--2 (1998), 213--216."},{"key":"e_1_3_2_1_38_1","volume-title":"Zhiru Zhang, and G Edward Suh.","author":"Hua Weizhe","year":"2019","unstructured":"Weizhe Hua, Yuan Zhou, Christopher M De Sa, Zhiru Zhang, and G Edward Suh. 2019. Channel gating neural networks. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"Weinberger","author":"Huang Gao","year":"2018","unstructured":"Gao Huang, Danlu Chen, Tianhong Li, Felix Wu, Laurens van der Maaten, and Kilian Q. Weinberger. 2018. Multi-Scale Dense Networks for Resource Efficient Image Classification. arXiv:1703.09844 [cs.LG]"},{"key":"e_1_3_2_1_40_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. In Advances in neural information processing systems. 103--112.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. In Advances in neural information processing systems. 103--112."},{"key":"e_1_3_2_1_41_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 947--960. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/jeon"},{"key":"e_1_3_2_1_42_1","volume-title":"Microsoft Research","author":"Jeon Myeongjae","year":"2018","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Junjie Qian, Amar Phanishayee, Wencong Xiao, and Fan Yang. 2018. Multi-tenant gpu clusters for deep learning workloads: Analysis and implications. Technical report, Microsoft Research (2018)."},{"key":"e_1_3_2_1_43_1","volume-title":"Article arXiv:1909.10351 (Sept.","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. TinyBERT: Distilling BERT for Natural Language Understanding. arXiv e-prints, Article arXiv:1909.10351 (Sept. 2019), arXiv:1909.10351 pages. arXiv:1909.10351 [cs.CL]"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_46_1","volume-title":"International Conference on machine learning. PMLR, 5958--5968","author":"Li Zhuohan","year":"2020","unstructured":"Zhuohan Li, Eric Wallace, Sheng Shen, Kevin Lin, Kurt Keutzer, Dan Klein, and Joey Gonzalez. 2020. Train big, then compress: Rethinking model size for efficient training and inference of transformers. In International Conference on machine learning. PMLR, 5958--5968."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.162"},{"key":"e_1_3_2_1_48_1","volume-title":"Runtime neural pruning. Advances in neural information processing systems 30","author":"Lin Ji","year":"2017","unstructured":"Ji Lin, Yongming Rao, Jiwen Lu, and Jie Zhou. 2017. Runtime neural pruning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.537"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 3504--3512","author":"Mangrulkar Sourab","year":"2022","unstructured":"Sourab Mangrulkar, Ankith MS, and Vivek Sembium. 2022. BE3R: BERT based Early-Exit Using Expert Routing. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 3504--3512."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874254"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_54_1","unstructured":"Jon Porter. 2023. ChatGPT continues to be one of the fastest-growing services ever. https:\/\/www.theverge.com\/2023\/11\/6\/23948386\/chatgpt-active-user-count-openai-developer-conference."},{"key":"e_1_3_2_1_55_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, 1--18. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/qiao"},{"key":"e_1_3_2_1_56_1","volume-title":"INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397--411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_2_1_57_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_58_1","first-page":"17456","article-title":"Confident adaptive language modeling","volume":"35","author":"Schuster Tal","year":"2022","unstructured":"Tal Schuster, Adam Fisch, Jai Gupta, Mostafa Dehghani, Dara Bahri, Vinh Tran, Yi Tay, and Donald Metzler. 2022. Confident adaptive language modeling. Advances in Neural Information Processing Systems 35 (2022), 17456--17472.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.593"},{"key":"e_1_3_2_1_60_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_1_63_1","volume-title":"Early Exiting with Ensemble Internal Classifiers. CoRR abs\/2105.13792","author":"Sun Tianxiang","year":"2021","unstructured":"Tianxiang Sun, Yunhua Zhou, Xiangyang Liu, Xinyu Zhang, Hao Jiang, Zhao Cao, Xuanjing Huang, and Xipeng Qiu. 2021. Early Exiting with Ensemble Internal Classifiers. CoRR abs\/2105.13792 (2021). arXiv:2105.13792 https:\/\/arxiv.org\/abs\/2105.13792"},{"key":"e_1_3_2_1_64_1","unstructured":"Surat Teerapittayanon Bradley McDanel and H. T. Kung. 2017. BranchyNet: Fast Inference via Early Exiting from Deep Neural Networks. arXiv e-prints Article arXiv:1709.01686 (Sept. 2017) arXiv:1709.01686 pages. arXiv:1709.01686 [cs.NE]"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2204.12013"},{"key":"e_1_3_2_1_66_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet MarieAnne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971 [cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414572"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.204"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.8"},{"key":"e_1_3_2_1_71_1","volume-title":"Condconv: Conditionally parameterized convolutions for efficient inference. Advances in Neural Information Processing Systems 32","author":"Yang Brandon","year":"2019","unstructured":"Brandon Yang, Gabriel Bender, Quoc V Le, and Jiquan Ngiam. 2019. Condconv: Conditionally parameterized convolutions for efficient inference. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_72_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521--538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_73_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Zhou Wangchunshu","year":"2020","unstructured":"Wangchunshu Zhou, Canwen Xu, Tao Ge, Julian McAuley, Ke Xu, and Furu Wei. 2020. BERT Loses Patience: Fast and Robust Inference with Early Exit. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 18330--18341. https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/d4dd111a4fd973394238aca5c05bebe3-Paper.pdf"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.231"}],"event":{"name":"SOSP '24: ACM SIGOPS 30th Symposium on Operating Systems Principles","location":"Austin TX USA","acronym":"SOSP '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695978","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694715.3695978","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:48Z","timestamp":1750291548000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695978"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":74,"alternative-id":["10.1145\/3694715.3695978","10.1145\/3694715"],"URL":"https:\/\/doi.org\/10.1145\/3694715.3695978","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}