{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:08:18Z","timestamp":1768345698489,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772261","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"600-613","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CIS: Checkpointed Inference for Data Drift-Resilient Model Serving at Edge Servers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9284-507X","authenticated-orcid":false,"given":"Sudipta Saha","family":"Shubha","sequence":"first","affiliation":[{"name":"University of Virginia, Charlottesville, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7548-6223","authenticated-orcid":false,"given":"Haiying","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Virginia, Charlottesville, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7479-1664","authenticated-orcid":false,"given":"Ganesh","family":"Ananthanarayanan","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proc. of NSDI","author":"Bhardwaj Romil","year":"2022","unstructured":"Romil Bhardwaj, Zhengxu Xia, Ganesh Ananthanarayanan, Junchen Jiang, Yuanchao Shu, Nikolaos Karianakis, Kevin Hsieh, Paramvir Bahl, and Ion Stoica. Ekya: Continuous learning of video analytics models on edge compute servers. In Proc. of NSDI, 2022."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604830"},{"key":"e_1_3_2_1_3_1","volume-title":"Proc. of EuroSys","author":"Park Misun","year":"2023","unstructured":"Misun Park, Ketan Bhardwaj, and Ada Gavrilovska. Pocket: ml serving from the edge. In Proc. of EuroSys, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"Microsoft Small Language Models for Edge. https:\/\/news.microsoft.com\/source\/features\/ai\/the-phi-3-small-language-models-with-big-potential\/","year":"2024","unstructured":"Microsoft. Microsoft Small Language Models for Edge. https:\/\/news.microsoft.com\/source\/features\/ai\/the-phi-3-small-language-models-with-big-potential\/, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"The era of 1-bit llms: All large language models are in 1.58 bits. arXiv preprint arXiv:2402.17764","author":"Ma Shuming","year":"2024","unstructured":"Shuming Ma, Hongyu Wang, Lingxiao Ma, Lei Wang, Wenhui Wang, Shaohan Huang, Li Dong, Ruiping Wang, Jilong Xue, and Furu Wei. The era of 1-bit llms: All large language models are in 1.58 bits. arXiv preprint arXiv:2402.17764, 2024."},{"key":"e_1_3_2_1_6_1","volume-title":"Rapid Growth of Edge Data Centers. https:\/\/www.marketsandmarkets.com\/Market-Reports\/edge-data-center-market-142018469.html","author":"Markets Markets","year":"2023","unstructured":"Markets and Markets. Rapid Growth of Edge Data Centers. https:\/\/www.marketsandmarkets.com\/Market-Reports\/edge-data-center-market-142018469.html, 2023."},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/aws.amazon.com\/wavelength\/","author":"Wavelength Amazon AWS. AWS","year":"2024","unstructured":"Amazon AWS. AWS Wavelength. https:\/\/aws.amazon.com\/wavelength\/, 2024."},{"key":"e_1_3_2_1_8_1","volume-title":"Azure Stack Edge. https:\/\/azure.microsoft.com\/en-us\/products\/azure-stack\/edge","year":"2024","unstructured":"Azure. Azure Stack Edge. https:\/\/azure.microsoft.com\/en-us\/products\/azure-stack\/edge, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"HPE edge server with multiple GPUs. https:\/\/community.hpe.com\/t5\/servers-systems-the-right\/meet-the-graphics-intensive-hpe-proliant-dl380a-gen11-server\/ba-p\/7184747","author":"HPE.","year":"2023","unstructured":"HPE. HPE edge server with multiple GPUs. https:\/\/community.hpe.com\/t5\/servers-systems-the-right\/meet-the-graphics-intensive-hpe-proliant-dl380a-gen11-server\/ba-p\/7184747, 2023."},{"key":"e_1_3_2_1_10_1","volume-title":"GPU-intensive HPE edge server. https:\/\/www.crn.com\/news\/ai\/2024\/a-new-edge-for-hpe-why-the-nvidia-ai-ready-proliant-dl145-gen11-server-is-a-big-breakthrough","author":"CRN.","year":"2024","unstructured":"CRN. GPU-intensive HPE edge server. https:\/\/www.crn.com\/news\/ai\/2024\/a-new-edge-for-hpe-why-the-nvidia-ai-ready-proliant-dl145-gen11-server-is-a-big-breakthrough, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"Edge servers with multiple GPUs in Azure Local. https:\/\/learn.microsoft. com\/en-us\/azure-stack\/hci\/manage\/gpu-preparation","year":"2024","unstructured":"Azure. Edge servers with multiple GPUs in Azure Local. https:\/\/learn.microsoft. com\/en-us\/azure-stack\/hci\/manage\/gpu-preparation, 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"HPE Proliant Edge Server with multiple GPUs. https:\/\/community.hpe.com\/t5\/servers-systems-the-right\/the-hpe-proliant-dl145-gen11-server-a-powerhouse-for-edge\/ba-p\/7230779","author":"HPE.","year":"2024","unstructured":"HPE. HPE Proliant Edge Server with multiple GPUs. https:\/\/community.hpe.com\/t5\/servers-systems-the-right\/the-hpe-proliant-dl145-gen11-server-a-powerhouse-for-edge\/ba-p\/7230779, 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"Proc. of NSDI","author":"Khani Mehrdad","year":"2023","unstructured":"Mehrdad Khani, Ganesh Ananthanarayanan, Kevin Hsieh, Junchen Jiang, Ravi Netravali, Yuanchao Shu, Mohammad Alizadeh, and Victor Bahl. {RECL}: Responsive {Resource-Efficient} continuous learning for video analytics. In Proc. of NSDI, 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"Proc. of ICML","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. In Proc. of ICML, 2023."},{"key":"e_1_3_2_1_15_1","volume-title":"Proc. of DAC","author":"Lee Soobee","year":"2022","unstructured":"Soobee Lee, Minindu Weerakoon, Jonghyun Choi, Minjia Zhang, Di Wang, and Myeongjae Jeon. Carm: Hierarchical episodic memory for continual learning. In Proc. of DAC, 2022."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781107298019"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW54805.2022.00065"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.587"},{"key":"e_1_3_2_1_19_1","volume-title":"A continual learning survey: Defying forgetting in classification tasks","author":"Lange Matthias De","year":"2021","unstructured":"Matthias De Lange, Rahaf Aljundi, Marc Masana, Sarah Parisot, Xu Jia, Ale\u0161 Leonardis, Gregory Slabaugh, and Tinne Tuytelaars. A continual learning survey: Defying forgetting in classification tasks. IEEE transactions on pattern analysis and machine intelligence, 44(7), 2021."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00073"},{"key":"e_1_3_2_1_21_1","volume-title":"Dacapo: Accelerating continuous learning in autonomous systems for video analytics. arXiv preprint arXiv:2403.14353","author":"Kim Yoonsung","year":"2024","unstructured":"Yoonsung Kim, Changhun Oh, Jinwoo Hwang, Wonung Kim, Seongryong Oh, Yubin Lee, Hardik Sharma, Amir Yazdanbakhsh, and Jongse Park. Dacapo: Accelerating continuous learning in autonomous systems for video analytics. arXiv preprint arXiv:2403.14353, 2024."},{"key":"e_1_3_2_1_22_1","volume-title":"Proc. of OSDI","author":"Shubha Sudipta Saha","year":"2024","unstructured":"Sudipta Saha Shubha, Haiying Shen, and Anand Iyer. USHER: Holistic interference avoidance for resource optimized ML inference. In Proc. of OSDI, 2024."},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. of OSDI","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, Joseph E Gonzalez, and Ion Stoica. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. In Proc. of OSDI, 2022."},{"key":"e_1_3_2_1_24_1","volume-title":"Proc. of ATC","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. Serving heterogeneous machine learning models on {Multi-GPU} servers with { Spatio-Temporal} sharing. In Proc. of ATC, 2022."},{"key":"e_1_3_2_1_25_1","volume-title":"Proc. of NSDI","author":"Padmanabhan Arthi","year":"2023","unstructured":"Arthi Padmanabhan, Neil Agarwal, Anand Iyer, Ganesh Ananthanarayanan, Yuanchao Shu, Nikolaos Karianakis, Guoqing Harry Xu, and Ravi Netravali. Gemel: Model merging for memory-efficient, real-time video analytics at the edge. In Proc. of NSDI, 2023."},{"key":"e_1_3_2_1_26_1","volume-title":"GPU partitioning in Azure Local to increase GPU utilization. https:\/\/docs.nvidia.com\/vgpu\/deployment\/azure-stack-hci\/latest\/gpup-workflow.html","author":"Azure Local Nvidia","year":"2024","unstructured":"Nvidia and Azure Local. GPU partitioning in Azure Local to increase GPU utilization. https:\/\/docs.nvidia.com\/vgpu\/deployment\/azure-stack-hci\/latest\/gpup-workflow.html, 2024."},{"key":"e_1_3_2_1_27_1","volume-title":"Mobilenetv4-universal models for the mobile ecosystem. arXiv preprint arXiv:2404.10518","author":"Qin Danfeng","year":"2024","unstructured":"Danfeng Qin, Chas Leichner, Manolis Delakis, Marco Fornoni, Shixin Luo, Fan Yang, Weijun Wang, Colby Banbury, Chengxi Ye, and Berkin Akin. Mobilenetv4-universal models for the mobile ecosystem. arXiv preprint arXiv:2404.10518, 2024."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00716"},{"key":"e_1_3_2_1_29_1","volume-title":"Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385, 2024."},{"key":"e_1_3_2_1_30_1","volume-title":"Michael Felsberg, Timothy Baldwin, Eric P. Xing, and Fahad Shahbaz Khan. Mobillama: Towards accurate and lightweight fully transparent gpt","author":"Thawakar Omkar","year":"2024","unstructured":"Omkar Thawakar, Ashmal Vayani, Salman Khan, Hisham Cholakkal, Rao Muhammad Anwer, Michael Felsberg, Timothy Baldwin, Eric P. Xing, and Fahad Shahbaz Khan. Mobillama: Towards accurate and lightweight fully transparent gpt, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"WikiText dataset. https:\/\/huggingface.co\/datasets\/wikitext\/blob\/main\/README.md","year":"2016","unstructured":"WikiPedia. WikiText dataset. https:\/\/huggingface.co\/datasets\/wikitext\/blob\/main\/README.md, 2016."},{"key":"e_1_3_2_1_32_1","volume-title":"https:\/\/huggingface.co\/models","author":"Models HuggingFace","year":"2023","unstructured":"HuggingFace. HuggingFace Models. https:\/\/huggingface.co\/models, 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"Proc. of ATC","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, Inigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In Proc. of ATC, 2020."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2004.1365067"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_36_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288, 2023."},{"key":"e_1_3_2_1_37_1","volume-title":"Proc. of OSDI","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, and Ion Stoica. { AlpaServe} : Statistical multiplexing with model parallelism for deep learning serving. In Proc. of OSDI, 2023."},{"key":"e_1_3_2_1_38_1","volume-title":"Proc. of SoCC","author":"Hu Yitao","year":"2021","unstructured":"Yitao Hu, Rajrup Ghosh, and Ramesh Govindan. Scrooge: A cost-effective deep learning inference system. In Proc. of SoCC, 2021."},{"key":"e_1_3_2_1_39_1","volume-title":"Proc. of NSDI","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. { SHEPHERD } : Serving {DNNs} in the wild. In Proc. of NSDI, 2023."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1039\/C3AY41907J"},{"key":"e_1_3_2_1_41_1","volume-title":"Proc. of OSDI","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis CM Lau, Yuqi Wang, Yifan Xiong, and Bin Wang. {HiveD}: Sharing a { GPU} cluster for deep learning with guarantees. In Proc. of OSDI, 2020."},{"key":"e_1_3_2_1_42_1","volume-title":"Custom scheduling in kubernetes: A survey on common problems and solution approaches. ACM Computing Surveys, 55(7)","author":"Rejiba Zeineb","year":"2022","unstructured":"Zeineb Rejiba and Javad Chamanara. Custom scheduling in kubernetes: A survey on common problems and solution approaches. ACM Computing Surveys, 55(7), 2022."},{"key":"e_1_3_2_1_43_1","volume-title":"Proc. of NSDI","author":"Rajasekaran Sudarsanan","year":"2024","unstructured":"Sudarsanan Rajasekaran, Manya Ghobadi, and Aditya Akella. Cassini: Network-aware job scheduling in machine learning clusters. In Proc. of NSDI, 2024."},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. of NSDI","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. Bamboo: Making preemptible instances resilient for affordable training of large {DNNs}. In Proc. of NSDI, 2023."},{"key":"e_1_3_2_1_45_1","volume-title":"Speeding up the hungarian algorithm. Computers & Operations Research, 17(1)","author":"Wright MB","year":"1990","unstructured":"MB Wright. Speeding up the hungarian algorithm. Computers & Operations Research, 17(1), 1990."},{"key":"e_1_3_2_1_46_1","volume-title":"Is combining classifiers with stacking better than selecting the best one? Machine learning, 54","author":"D\u017eeroski Saso","year":"2004","unstructured":"Saso D\u017eeroski and Bernard \u017denko. Is combining classifiers with stacking better than selecting the best one? Machine learning, 54, 2004."},{"key":"e_1_3_2_1_47_1","volume-title":"Nvidia Multi Process Service (MPS). https:\/\/docs.nvidia.com\/deploy\/mps\/index.html","year":"2021","unstructured":"Nvidia. Nvidia Multi Process Service (MPS). https:\/\/docs.nvidia.com\/deploy\/mps\/index.html, 2021."},{"key":"e_1_3_2_1_48_1","volume-title":"Nvidia Profiler: nvprof. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/","year":"2024","unstructured":"Nvidia. Nvidia Profiler: nvprof. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/, 2024."},{"key":"e_1_3_2_1_49_1","volume-title":"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and < 0.5 mb model size. arXiv preprint arXiv:1602.07360","author":"Iandola Forrest N","year":"2016","unstructured":"Forrest N Iandola, Song Han, Matthew W Moskewicz, Khalid Ashraf, William J Dally, and Kurt Keutzer. Squeezenet: Alexnet-level accuracy with 50x fewer parameters and < 0.5 mb model size. arXiv preprint arXiv:1602.07360, 2016."},{"key":"e_1_3_2_1_50_1","volume-title":"Proc. of ICML","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. Efficientnet: Rethinking model scaling for convolutional neural networks. In Proc. of ICML, 2019."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483580"},{"key":"e_1_3_2_1_52_1","volume-title":"Proc. of OSDI","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. Serving {DNNs} like clockwork: Performance predictability from the bottom up. In Proc. of OSDI, 2020."},{"key":"e_1_3_2_1_53_1","volume-title":"Proc. of NSDI","author":"Gunasekaran Jashwant Raj","year":"2022","unstructured":"Jashwant Raj Gunasekaran, Cyan Subhra Mishra, Prashanth Thinakaran, Bikash Sharma, Mahmut Taylan Kandemir, and Chita R Das. Cocktail: A multidimensional optimization for model serving in cloud. In Proc. of NSDI, 2022."},{"key":"e_1_3_2_1_54_1","volume-title":"Proc. of SOSP","author":"Shen Haichen","year":"2019","unstructured":"Haichen Shen, Lequn Chen, Yuchen Jin, Liangyu Zhao, Bingyu Kong, Matthai Philipose, Arvind Krishnamurthy, and Ravi Sundaram. Nexus: A gpu cluster engine for accelerating dnn-based video analysis. In Proc. of SOSP, 2019."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3304109.3306221"},{"key":"e_1_3_2_1_56_1","volume-title":"Proc. of EuroSys","author":"Kannan Ram Srivatsa","year":"2019","unstructured":"Ram Srivatsa Kannan, Lavanya Subramanian, Ashwin Raju, Jeongseob Ahn, Jason Mars, and Lingjia Tang. Grandslam: Guaranteeing slas for jobs in microservices execution frameworks. In Proc. of EuroSys, 2019."},{"key":"e_1_3_2_1_57_1","volume-title":"Proc. of SoCC","author":"Crankshaw Daniel","year":"2020","unstructured":"Daniel Crankshaw, Gur-Eyal Sela, Xiangxi Mo, Corey Zumar, Ion Stoica, Joseph Gonzalez, and Alexey Tumanov. Inferline: latency-aware provisioning and scaling for prediction serving pipelines. In Proc. of SoCC, 2020."},{"key":"e_1_3_2_1_58_1","volume-title":"Proc. of NSDI","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J Franklin, Joseph E Gonzalez, and Ion Stoica. Clipper: A {Low-Latency} online prediction serving system. In Proc. of NSDI, 2017."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.2737\/FPL-GTR-290"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485730.3493446"},{"key":"e_1_3_2_1_61_1","volume-title":"Proc. of SIGCOMM","author":"Jiang Junchen","year":"2018","unstructured":"Junchen Jiang, Ganesh Ananthanarayanan, Peter Bodik, Siddhartha Sen, and Ion Stoica. Chameleon: scalable adaptation of video analytics. In Proc. of SIGCOMM, 2018."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530510"},{"key":"e_1_3_2_1_63_1","volume-title":"Proc. of SOSP","author":"Subramanya Suhas Jayaram","year":"2023","unstructured":"Suhas Jayaram Subramanya, Daiyaan Arfeen, Shouxu Lin, Aurick Qiao, Zhihao Jia, and Gregory R Ganger. Sia: Heterogeneity-aware, goodput-optimized mlcluster scheduling. In Proc. of SOSP, 2023."},{"key":"e_1_3_2_1_64_1","volume-title":"Proc. of OSDI","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for { Transformer-Based} generative models. In Proc. of OSDI, 2022."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_66_1","volume-title":"Proc. of OSDI","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming throughput-latency tradeoff in llm inference with sarathi-serve. In Proc. of OSDI, 2024."},{"key":"e_1_3_2_1_67_1","unstructured":"Connor Holmes Masahiro Tanaka Michael Wyatt Ammar Ahmad Awan Jeff Rasley Samyam Rajbhandari Reza Yazdani Aminabadi Heyang Qin Arash Bakhtiari Lev Kurilenko and Yuxiong He. Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeed-inference. arXiv preprint arXiv:2401.08671 2024."},{"key":"e_1_3_2_1_68_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920, 2023."},{"key":"e_1_3_2_1_69_1","volume-title":"Proc. of ASPLOS","author":"Feng Boyuan","year":"2024","unstructured":"Boyuan Feng, Zheng Wang, Yuke Wang, Shu Yang, and Yufei Ding. Zeno: A typebased optimization framework for zero knowledge neural network inference. In Proc. of ASPLOS, 2024."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624849"},{"key":"e_1_3_2_1_72_1","volume-title":"Proc. of ATC","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. {INFaaS}: Automated model-less inference serving. In Proc. of ATC, 2021."},{"key":"e_1_3_2_1_73_1","volume-title":"Proc. of ATC","author":"Jiang Angela H","year":"2018","unstructured":"Angela H Jiang, Daniel L-K Wong, Christopher Canel, Lilia Tang, Ishan Misra, Michael Kaminsky, Michael A Kozuch, Padmanabhan Pillai, David G Andersen, and Gregory R Ganger. Mainstream: Dynamic { Stem-Sharing} for { Multi-Tenant} video processing. In Proc. of ATC, 2018."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629565"},{"key":"e_1_3_2_1_75_1","volume-title":"Proc. of EuroSys","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Xianzhe Ma, and Ana Klimovic. Orion: Interference-aware, finegrained GPU sharing for ML applications. In Proc. of EuroSys, 2024."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3704742.3704964"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772261","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:20:26Z","timestamp":1768321226000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772261"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":76,"alternative-id":["10.1145\/3772052.3772261","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772261","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}