{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:44:22Z","timestamp":1768347862193,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["62172050"],"award-info":[{"award-number":["62172050"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772249","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"493-506","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CoMPI: Coordinated Model Merging and Parallel Inference at Edge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-0226-4952","authenticated-orcid":false,"given":"Shuang","family":"Zeng","sequence":"first","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9131-3517","authenticated-orcid":false,"given":"Haitao","family":"Zhang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8594-856X","authenticated-orcid":false,"given":"Zezhong","family":"Yan","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. NVIDIA Dynamo Inference Framework. https:\/\/developer.nvidia.com\/dynamo"},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon Web Services Inc. 2025. AWS Outposts. https:\/\/aws.amazon.com\/outposts\/."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307334.3328589"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20)","author":"Bai Zhihao","year":"2020","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. PipeSwitch: fast pipelined context switching for deep learning applications. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20). USENIX Association, Article 28, 16 pages."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11554-018-0840-6"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3320060"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 2011 International Conference on Unsupervised and Transfer Learning Workshop -","volume":"27","author":"Bengio Yoshua","year":"2011","unstructured":"Yoshua Bengio. 2011. Deep learning of representations for unsupervised and transfer learning. In Proceedings of the 2011 International Conference on Unsupervised and Transfer Learning Workshop - Volume 27 (UTLW'11). JMLR.org, 17\u201337."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E.2019.00-10"},{"key":"e_1_3_2_1_10_1","volume-title":"Dulloor","author":"Canel Christopher","year":"2019","unstructured":"Christopher Canel, Thomas Kim, Giulio Zhou, Conglong Li, Hyeontaek Lim, David G. Andersen, Michael Kaminsky, and Subramanya R. Dulloor. 2019. Scaling Video Analytics on Constrained Edge Nodes. ArXiv abs\/1905.13536 (2019). https:\/\/api.semanticscholar.org\/CorpusID:92991297"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 8th International Conference on Neural Information Processing Systems (NIPS '94)","author":"Caruana Rich","year":"1994","unstructured":"Rich Caruana. 1994. Learning many related tasks at the same time with backprop-agation. In Proceedings of the 8th International Conference on Neural Information Processing Systems (NIPS '94). MIT Press, 657\u2013664."},{"key":"e_1_3_2_1_13_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, 613\u2013627. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/crankshaw"},{"key":"e_1_3_2_1_14_1","volume-title":"DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Cui Weihao","year":"2022","unstructured":"Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, 183\u2013198. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/cui"},{"key":"e_1_3_2_1_15_1","volume-title":"Oh (Eds.)","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 16344\u201316359. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/67d57c32e20fd0a7a302cb81d36e40d5-Paper-Conference.pdf"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems (NIPS '22)","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit matrix multiplication for transformers at scale. In Proceedings of the 36th International Conference on Neural Information Processing Systems (NIPS '22). Curran Associates Inc., Article 2198, 15 pages."},{"key":"e_1_3_2_1_18_1","unstructured":"ONNX Runtime developers. 2025. ONNX Runtime. https:\/\/onnxruntime.ai\/."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638466"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_22_1","volume-title":"Low-Power Computer Vision","author":"Gholami Amir","unstructured":"Amir Gholami, Sehoon Kim, Zhen Dong, Zhewei Yao, Michael W Mahoney, and Kurt Keutzer. 2022. A survey of quantization methods for efficient neural network inference. In Low-Power Computer Vision. Chapman and Hall\/CRC, 291\u2013326."},{"key":"e_1_3_2_1_23_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 443\u2013462. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/gujarati"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3526173"},{"key":"e_1_3_2_1_25_1","unstructured":"Gurobi Optimization LLC. 2025. Gurobi Optimizer Reference Manual. https:\/\/www.gurobi.com"},{"key":"e_1_3_2_1_26_1","volume-title":"Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, 539\u2013558. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/han"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_28_1","volume-title":"Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. Curran Associates Inc."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E.2018.00052"},{"key":"e_1_3_2_1_30_1","first-page":"711","article-title":"Data movement is all you need: A case study on optimizing transformers","volume":"3","author":"Ivanov Andrei","year":"2021","unstructured":"Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. 2021. Data movement is all you need: A case study on optimizing transformers. Proceedings of Machine Learning and Systems 3 (2021), 711\u2013732.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Learning Representations.","author":"Jia Hengrui","unstructured":"Hengrui Jia, Hongyu Chen, Jonas Guan, Ali Shahin Shamsabadi, and Nicolas Papernot.2021. A zest of lime: Towards architecture-independent model distances. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_32_1","volume-title":"2018 USENIX Annual Technical Conference (USENIX ATC 18)","author":"Jiang Angela H","year":"2018","unstructured":"Angela H Jiang, Daniel L-K Wong, Christopher Canel, Lilia Tang, Ishan Misra, Michael Kaminsky, Michael A Kozuch, Padmanabhan Pillai, David G Andersen, and Gregory R Ganger. 2018. Mainstream: Dynamic {Stem-Sharing} for {Multi-Tenant} Video Processing. In 2018 USENIX Annual Technical Conference (USENIX ATC 18). 29\u201342."},{"key":"e_1_3_2_1_33_1","volume-title":"An Approximation for Job Scheduling on Cloud with Synchronization and Slowdown Constraints. In IEEE INFOCOM 2023-IEEE Conference on Computer Communications. IEEE, 1\u201310","author":"Kong Dejun","year":"2023","unstructured":"Dejun Kong, Zhongrui Zhang, Yangguang Shi, and Xiaofeng Gao. 2023. An Approximation for Job Scheduling on Cloud with Synchronization and Slowdown Constraints. In IEEE INFOCOM 2023-IEEE Conference on Computer Communications. IEEE, 1\u201310."},{"key":"e_1_3_2_1_34_1","volume-title":"PRETZEL: Opening the Black Box of Machine Learning Prediction Serving Systems. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Lee Yunseong","year":"2018","unstructured":"Yunseong Lee, Alberto Scolari, Byung-Gon Chun, Marco Domenico Santambrogio, Markus Weimer, and Matteo Interlandi. 2018. PRETZEL: Opening the Black Box of Machine Learning Prediction Serving Systems. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, 611\u2013626. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/lee"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460319.3464816"},{"key":"e_1_3_2_1_36_1","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. { AlpaServe} : Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663\u2013679."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 6543\u20136552","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. 2021. Terapipe: Token-level pipeline parallelism for training large-scale language models. In International Conference on Machine Learning. PMLR, 6543\u20136552."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_39_1","unstructured":"Microsoft Corporation. 2025. Azure Stack Edge. https:\/\/azure.microsoft.com\/en-us\/products\/azure-stack\/edge\/."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_41_1","volume-title":"International Conference on Machine Learning. PMLR, 7937\u20137947","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021. Memory-efficient pipeline-parallel dnn training. In International Conference on Machine Learning. PMLR, 7937\u20137947."},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA Corporation. 2025. NVIDIA Jetson: The AI platform for edge computing. https:\/\/www.nvidia.com\/en-us\/autonomous-machines\/embedded-systems\/."},{"key":"e_1_3_2_1_43_1","unstructured":"NVIDIA Corporation. 2025. NVIDIA TensorRT: High Performance Deep Learning Inference. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640383"},{"key":"e_1_3_2_1_45_1","unstructured":"Christopher Olston Noah Fiedel Kiril Gorovoy Jeremiah Harmsen Li Lao Fangwei Li Vinu Rajashekhar Sukriti Ramesh and Jordan Soyke. 2017. TensorFlow-Serving: Flexible High-Performance ML Serving. arXiv:1712.06139 [cs.DC] https:\/\/arxiv.org\/abs\/1712.06139"},{"key":"e_1_3_2_1_46_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Padmanabhan Arthi","year":"2023","unstructured":"Arthi Padmanabhan, Neil Agarwal, Anand Iyer, Ganesh Ananthanarayanan, Yuanchao Shu, Nikolaos Karianakis, Guoqing Harry Xu, and Ravi Netravali. 2023. Gemel: Model Merging for {Memory-Efficient},{Real-Time} Video Analytics at the Edge. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 973\u2013994."},{"key":"e_1_3_2_1_47_1","volume-title":"High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems (NeurIPS)","volume":"32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 32. Curran Associates, Inc. http:\/\/arxiv.org\/abs\/1912.01703"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of machine learning and systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of machine learning and systems 5 (2023), 606\u2013624."},{"key":"e_1_3_2_1_49_1","volume-title":"INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397\u2013411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_2_1_50_1","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2020. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. arXiv:1910.01108 [cs.CL] https:\/\/arxiv.org\/abs\/1910.01108"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2963056"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 2020 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC'20). USENIX Association, Article 14","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, \u00cd\u00f1igo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. 2020. Serverless in the wild: characterizing and optimizing the serverless workload at a large cloud provider. In Proceedings of the 2020 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC'20). USENIX Association, Article 14, 14 pages."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems (NIPS'18)","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Pen-porn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, Ryan Sepassi, and Blake Hechtman. 2018. Mesh-TensorFlow: deep learning for supercomputers. In Proceedings of the 32nd International Conference on Neural Information Processing Systems (NIPS'18). Curran Associates Inc., 10435\u201310444."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_55_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_56_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Shubha Sudipta Saha","year":"2024","unstructured":"Sudipta Saha Shubha, Haiying Shen, and Anand Iyer. 2024. {USHER}: Holistic Interference Avoidance for Resource Optimized {ML} Inference. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 947\u2013964."},{"key":"e_1_3_2_1_57_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC'24). USENIX Association, Article 35","author":"Um Taegeon","year":"2024","unstructured":"Taegeon Um, Byungsoo Oh, Minyoung Kang, Woo-Yeon Lee, Goeun Kim, Dongseob Kim, Youngtaek Kim, Mohd Muzzammil, and Myeongjae Jeon. 2024. Metis: fast automatic distributed training on heterogeneous GPUs. In Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC'24). USENIX Association, Article 35, 16 pages."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData52589.2021.9671658"},{"key":"e_1_3_2_1_60_1","volume-title":"Yolov10: Real-time end-to-end object detection. arXiv preprint arXiv:2405.14458","author":"Wang Ao","year":"2024","unstructured":"Ao Wang, Hui Chen, Lihao Liu, Kai Chen, Zijia Lin, Jungong Han, and Guiguang Ding. 2024. Yolov10: Real-time end-to-end object detection. arXiv preprint arXiv:2405.14458 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"European conference on computer vision. Springer, 1\u201321","author":"Wang Chien-Yao","year":"2024","unstructured":"Chien-Yao Wang, I-Hau Yeh, and Hong-Yuan Mark Liao. 2024. Yolov9: Learning what you want to learn using programmable gradient information. In European conference on computer vision. Springer, 1\u201321."},{"key":"e_1_3_2_1_62_1","volume-title":"ParaLoupe: Real-time Video Analytics on Edge Cluster via Mini Model Parallelization","author":"Wang Hanling","year":"2024","unstructured":"Hanling Wang, Qing Li, Haidong Kang, Dieli Hu, Lianbo Ma, Gareth Tyson, Zhenhui Yuan, and Yong Jiang. 2024. ParaLoupe: Real-time Video Analytics on Edge Cluster via Mini Model Parallelization. IEEE Transactions on Mobile Computing(2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00215"},{"key":"e_1_3_2_1_64_1","volume-title":"Transparent GPU Sharing in Container Clouds for Deep Learning Workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Zili Zhang, Zhihao Bai, Xuanzhe Liu, and Xin Jin. 2023. Transparent GPU Sharing in Container Clouds for Deep Learning Workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, 69\u201385. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/wu"},{"key":"e_1_3_2_1_65_1","volume-title":"[n. d.]. GSPMD: General and scalable parallelization for ML computation graphs","author":"Xu Yuanzhong","year":"2021","unstructured":"Yuanzhong Xu, HyoukJoong Lee, Dehao Chen, Blake Hechtman, Yanping Huang, Rahul Joshi, Maxim Krikun, Dmitry Lepikhin, Andy Ly, Marcello Maggioni, et al. [n. d.]. GSPMD: General and scalable parallelization for ML computation graphs (2021). URL http:\/\/arxiv.org\/abs\/2105.04663 ([n.d.])."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/MWC.001.2000292"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650074"},{"key":"e_1_3_2_1_68_1","volume-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems -","volume":"2","author":"Yosinski Jason","year":"2014","unstructured":"Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks?. In Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 2 (NIPS'14). MIT Press, 3320\u20133328."},{"key":"e_1_3_2_1_69_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, 521\u2013538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSC.2017.2653116"},{"key":"e_1_3_2_1_71_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, 787\u2013808. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhang-hong"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2019.2925910"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483580"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483580"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544224"},{"key":"e_1_3_2_1_76_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559\u2013578."},{"key":"e_1_3_2_1_77_1","volume-title":"PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. 2022. PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, 489\u2013504. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/zhou-zhe"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772249","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:25:46Z","timestamp":1768321546000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772249"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":77,"alternative-id":["10.1145\/3772052.3772249","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772249","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}