{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T09:05:05Z","timestamp":1768986305030,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","funder":[{"name":"Hong Kong Research Grant Council","award":["C6015-23G, 16217124, 16210822"],"award-info":[{"award-number":["C6015-23G, 16217124, 16210822"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772229","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"240-254","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ZipBatch: Multi-Tenant GPU Batching with Dual-Resource Regulation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9677-7960","authenticated-orcid":false,"given":"Haoxuan","family":"Yu","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5143-8352","authenticated-orcid":false,"given":"Sheng","family":"Yao","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4585-4152","authenticated-orcid":false,"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. ROCm\/ROCm. https:\/\/github.com\/ROCm\/ROCm."},{"key":"e_1_3_2_1_2_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arXiv:2308.16369 [cs.LG]","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arXiv:2308.16369 [cs.LG]"},{"key":"e_1_3_2_1_3_1","volume-title":"Anderson","author":"Bakita Joshua","year":"2025","unstructured":"Joshua Bakita and James H. Anderson. 2025. Hardware Compute Partitioning on NVIDIA GPUs for Composable Systems. 335 (2025), 21:1\u201321:25. https:\/\/drops.dagstuhl.de\/entities\/document\/10.4230\/LIPIcs.ECRTS.2025.21"},{"key":"e_1_3_2_1_4_1","volume-title":"Anderson","author":"Bakita Joshua","year":"2023","unstructured":"Joshua Bakita and James H. Anderson. 2023. Hardware Compute Partitioning on NVIDIA GPUs*. In Proc. IEEE RTAS."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018748"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2308.07470"},{"key":"e_1_3_2_1_7_1","unstructured":"Tianqi Chen and Carlos Guestrin. 2016. XGBoost: A Scalable Tree Boosting System. In KDD (2016-08-13). https:\/\/dl.acm.org\/doi\/10.1145\/2939672.2939785"},{"key":"e_1_3_2_1_8_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning.","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696074"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607060"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00049"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071121"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731569.3764818"},{"key":"e_1_3_2_1_14_1","volume-title":"DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Cui Weihao","year":"2022","unstructured":"Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 183\u2013198."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image Is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv:2010.11929 [cs] doi:10.48550\/arXiv.2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"key":"e_1_3_2_1_18_1","unstructured":"Ruwen Fan Tingxu Ren Shiwei Gao Jiwu Shu and Youyou Lu. [n. d.]. GPreempt: GPU Preemptive Scheduling Made General and Efficient. ([n. d.])."},{"key":"e_1_3_2_1_19_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 443\u2013462."},{"key":"e_1_3_2_1_20_1","volume-title":"Microsecond-Scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-Scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 539\u2013558."},{"key":"e_1_3_2_1_21_1","volume-title":"Deep Residual Learning for Image Recognition. CoRR abs\/1512.03385","author":"He Kaiming","year":"2015","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep Residual Learning for Image Recognition. CoRR abs\/1512.03385 (2015). arXiv:1512.03385"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_23_1","volume-title":"Nikolopoulos","author":"Hong Cheol-Ho","year":"2017","unstructured":"Cheol-Ho Hong, Ivor Spence, and Dimitrios S. Nikolopoulos. 2017. GPU Virtualization and Scheduling Methods: A Comprehensive Survey. ACM Comput. Surv. 50, 3 (2017), 35:1\u201335:37."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575705"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS.2019.00011"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_27_1","volume-title":"Deep Learning. Nature 521, 7553","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep Learning. Nature 521, 7553 (2015), 436\u2013444."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","unstructured":"Munkyu Lee Sihoon Seong Minki Kang Jihyuk Lee Gap-Joo Na In-Geol Chun Dimitrios Nikolopoulos and Cheol-Ho Hong. 2024. ParvaGPU: Efficient Spatial GPU Sharing for Large-Scale DNN Inference in Cloud Environments. arXiv:2409.14447 doi:10.48550\/arXiv.2409.14447","DOI":"10.48550\/arXiv.2409.14447"},{"key":"e_1_3_2_1_29_1","unstructured":"Bingyao Li Yueqi Wang Tianyu Wang Lieven Eeckhout Jun Yang Aamer Jaleel and Xulong Tang. 2024. Improving Multi-Instance GPU Efficiency via Sub-Entry Sharing TLB Design. arXiv:2404.18361 [cs]"},{"key":"e_1_3_2_1_30_1","volume-title":"Bullet: Boosting GPU Utilization for LLM Serving via Dynamic Spatial-Temporal Orchestration.","author":"Lin Zejia","year":"2025","unstructured":"Zejia Lin, Hongxin Xu, Guanyi Chen, Xianwei Zhang, and Yutong Lu. 2025. Bullet: Boosting GPU Utilization for LLM Serving via Dynamic Spatial-Temporal Orchestration."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613163"},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. 2017. Volta Architecture Whitepaper."},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA. 2019. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute"},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA. 2019. Triton Inference Server. https:\/\/github.com\/triton-inference-server."},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2021. NVIDIA Ampere GA102 GPU Architecture Whitepaper."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA. 2021. NVIDIA Multi-Instance GPU (MIG). https:\/\/www.nvidia.com\/en-us\/technologies\/multi-instance-gpu\/."},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA. 2025. Green Contexts. https:\/\/docs.nvidia.com\/cuda\/cuda-driver-api\/index.html."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat Red Avila Igor Babuschkin Suchir Balaji Valerie Balcom Paul Baltescu Haiming Bao Mohammad Bavarian Jeff Belgum Irwan Bello Jake Berdine Gabriel Bernadett-Shapiro Christopher Berner Lenny Bogdonoff Oleg Boiko Madelaine Boyd Anna-Luisa Brakman Greg Brockman Tim Brooks Miles Brundage Kevin Button Trevor Cai Rosie Campbell Andrew Cann Brittany Carey Chelsea Carlson Rory Carmichael Brooke Chan Che Chang Fotis Chantzis Derek Chen Sully Chen Ruby Chen Jason Chen Mark Chen Ben Chess Chester Cho Casey Chu Hyung Won Chung Dave Cummings Jeremiah Currier Yunxing Dai Cory Decareaux Thomas Degry Noah Deutsch Damien Deville Arka Dhar David Dohan Steve Dowling Sheila Dunning Adrien Ecoffet Atty Eleti Tyna Eloundou David Farhi Liam Fedus Niko Felix Sim\u00f3n Posada Fishman Juston Forte Isabella Fulford Leo Gao Elie Georges Christian Gibson Vik Goel Tarun Gogineni Gabriel Goh Rapha Gontijo-Lopes Jonathan Gordon Morgan Grafstein Scott Gray Ryan Greene Joshua Gross Shixiang Shane Gu Yufei Guo Chris Hallacy Jesse Han Jeff Harris Yuchen He Mike Heaton Johannes Heidecke Chris Hesse Alan Hickey Wade Hickey Peter Hoeschele Brandon Houghton Kenny Hsu Shengli Hu Xin Hu Joost Huizinga Shantanu Jain Shawn Jain Joanne Jang Angela Jiang Roger Jiang Haozhun Jin Denny Jin Shino Jomoto Billie Jonn Heewoo Jun Tomer Kaftan \u0141ukasz Kaiser Ali Kamali Ingmar Kanitscheider Nitish Shirish Keskar Tabarak Khan Logan Kilpatrick Jong Wook Kim Christina Kim Yongjik Kim Jan Hendrik Kirchner Jamie Kiros Matt Knight Daniel Kokotajlo \u0141ukasz Kondraciuk Andrew Kondrich Aris Konstantinidis Kyle Kosic Gretchen Krueger Vishal Kuo Michael Lampe Ikai Lan Teddy Lee Jan Leike Jade Leung Daniel Levy Chak Ming Li Rachel Lim Molly Lin Stephanie Lin Mateusz Litwin Theresa Lopez Ryan Lowe Patricia Lue Anna Makanju Kim Malfacini Sam Manning Todor Markov Yaniv Markovski Bianca Martin Katie Mayer Andrew Mayne Bob McGrew Scott Mayer McKinney Christine McLeavey Paul McMillan Jake McNeil David Medina Aalok Mehta Jacob Menick Luke Metz Andrey Mishchenko Pamela Mishkin Vinnie Monaco Evan Morikawa Daniel Mossing Tong Mu Mira Murati Oleg Murk David M\u00e9ly Ashvin Nair Reiichiro Nakano Rajeev Nayak Arvind Neelakantan Richard Ngo Hyeonwoo Noh Long Ouyang Cullen O'Keefe Jakub Pachocki Alex Paino Joe Palermo Ashley Pantuliano Giambattista Parascandolo Joel Parish Emy Parparita Alex Passos Mikhail Pavlov Andrew Peng Adam Perelman Filipe de Avila Belbute Peres Michael Petrov Henrique Ponde de Oliveira Pinto Michael Pokorny Michelle Pokrass Vitchyr H. Pong Tolly Powell Alethea Power Boris Power Elizabeth Proehl Raul Puri Alec Radford Jack Rae Aditya Ramesh Cameron Raymond Francis Real Kendra Rimbach Carl Ross Bob Rotsted Henri Roussez Nick Ryder Mario Saltarelli Ted Sanders Shibani Santurkar Girish Sastry Heather Schmidt David Schnurr John Schulman Daniel Selsam Kyla Sheppard Toki Sherbakov Jessica Shieh Sarah Shoker Pranav Shyam Szymon Sidor Eric Sigler Maddie Simens Jordan Sitkin Katarina Slama Ian Sohl Benjamin Sokolowsky Yang Song Natalie Staudacher Felipe Petroski Such Natalie Summers Ilya Sutskever Jie Tang Nikolas Tezak Madeleine B. Thompson Phil Tillet Amin Tootoonchian Elizabeth Tseng Preston Tuggle Nick Turley Jerry Tworek Juan Felipe Cer\u00f3n Uribe Andrea Vallone Arun Vijayvergiya Chelsea Voss Carroll Wainwright Justin Jay Wang Alvin Wang Ben Wang Jonathan Ward Jason Wei C. J. Weinmann Akila Welihinda Peter Welinder Jiayi Weng Lilian Weng Matt Wiethoff Dave Willner Clemens Winter Samuel Wolrich Hannah Wong Lauren Workman Sherwin Wu Jeff Wu Michael Wu Kai Xiao Tao Xu Sarah Yoo Kevin Yu Qiming Yuan Wojciech Zaremba Rowan Zellers Chong Zhang Marvin Zhang Shengjia Zhao Tianhao Zheng Juntang Zhuang William Zhuk and Barret Zoph. 2024. GPT-4 Technical Report. arXiv:2303.08774 [cs] doi:10.48550\/arXiv.2303.08774","DOI":"10.48550\/arXiv.2303.08774"},{"key":"e_1_3_2_1_39_1","volume-title":"An Introduction to Convolutional Neural Networks. CoRR abs\/1511.08458","author":"O'Shea Keiron","year":"2015","unstructured":"Keiron O'Shea and Ryan Nash. 2015. An Introduction to Convolutional Neural Networks. CoRR abs\/1511.08458 (2015). arXiv:1511.08458"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/2490301.2451160"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3652892.3700768"},{"key":"e_1_3_2_1_42_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. {INFaaS}: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 397\u2013411."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Olaf Ronneberger Philipp Fischer and Thomas Brox. 2015. U-Net: Convolutional Networks for Biomedical Image Segmentation. arXiv:1505.04597 [cs.CV]","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. USENIX ATC.","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, \u00cd\u00f1igo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. 2020. Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider. In Proc. USENIX ATC."},{"key":"e_1_3_2_1_45_1","volume-title":"Welder: Scheduling Deep Learning Memory Access via Tile-graph.","author":"Shi Yining","year":"2023","unstructured":"Yining Shi, Zhi Yang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Ziming Miao, Yuxiao Guo, Fan Yang, and Lidong Zhou. 2023. Welder: Scheduling Deep Learning Memory Access via Tile-graph."},{"key":"e_1_3_2_1_46_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Shubha Sudipta Saha","year":"2024","unstructured":"Sudipta Saha Shubha, Haiying Shen, and Anand Iyer. 2024. {USHER}: Holistic Interference Avoidance for Resource Optimized {ML} Inference. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 947\u2013964."},{"key":"e_1_3_2_1_47_1","unstructured":"Benjamin Spector Jordan Juravsky Stuart Sul Owen Dugan Dylan Lim Dan Fu Simran Arora and Chris R\u00e9. 2025. Look ma no bubbles! designing a low-latency Megakernel for LLAMA-1B. https:\/\/hazyresearch.stanford.edu\/blog\/2025-05-27-no-bubbles"},{"key":"e_1_3_2_1_48_1","volume-title":"Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications.","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati. 2024. Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications. (2024)."},{"key":"e_1_3_2_1_49_1","unstructured":"Veraxus. 2024. Nouveau. https:\/\/github.com\/Veraxus\/nouveau"},{"key":"e_1_3_2_1_50_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. {MLaaS}in the Wild: Workload Analysis and Scheduling in {Large-Scale} Heterogeneous {GPU} Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 945\u2013960."},{"key":"e_1_3_2_1_51_1","first-page":"995","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Weng Qizhen","year":"2023","unstructured":"Qizhen Weng, Lingyun Yang, Yinghao Yu, Wei Wang, Xiaochuan Tang, Guodong Yang, and Liping Zhang. 2023. Beware of Fragmentation: Scheduling {GPU-Sharing} Workloads with Fragmentation Gradient Descent. In 2023 USENIX Annual Technical Conference (USENIX ATC 23).995-1008."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037742"},{"key":"e_1_3_2_1_53_1","unstructured":"Bingyang Wu Zili Zhang Zhihao Bai Xuanzhe Liu and Xin Jin. 2023. Transparent GPU Sharing in Container Clouds for Deep Learning Workloads."},{"key":"e_1_3_2_1_54_1","volume-title":"Proc. IEEE RTSS.","author":"Yandrofski Tyler","unstructured":"Tyler Yandrofski, Jingyuan Chen, Nathan Otterness, James H. Anderson, and F. Donelson Smith. 2022. Making Powerful Enemies on NVIDIA GPUs. In Proc. IEEE RTSS."},{"key":"e_1_3_2_1_55_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521\u2013538."},{"key":"e_1_3_2_1_56_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 787\u2013808."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Shulai Zhang Quan Chen Weihao Cui Han Zhao Chunyu Xue Zhen Zheng Wei Lin and Minyi Guo. 2025. Improving GPU Sharing Performance through Adaptive Bubbleless Spatial-Temporal Sharing. In EuroSys (2025-03-30). https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696070","DOI":"10.1145\/3689031.3696070"},{"key":"e_1_3_2_1_58_1","unstructured":"Shulai Zhang Ao Xu Quan Chen Han Zhao Weihao Cui Zhen Wang Yan Li Limin Xiao and Minyi Guo. 2025. Efficient {Performance-Aware} {GPU} Sharing with Compatibility and Isolation through Kernel Space Interception."},{"key":"e_1_3_2_1_59_1","volume-title":"SGDRC: Software-Defined Dynamic Resource Control for Concurrent DNN Inference on NVIDIA GPUs. In PPoPP (2025-02-28). https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710863","author":"Zhang Yongkang","year":"2025","unstructured":"Yongkang Zhang, Haoxuan Yu, Chenxia Han, Cheng Wang, Baotong Lu, Yunzhe Li, Zhifeng Jiang, Yang Li, Xiaowen Chu, and Huaicheng Li. 2025. SGDRC: Software-Defined Dynamic Resource Control for Concurrent DNN Inference on NVIDIA GPUs. In PPoPP (2025-02-28). https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710863"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2410.07381"},{"key":"e_1_3_2_1_61_1","volume-title":"UGPU: Dynamically Constructing Unbalanced GPUs for Enhanced Resource Efficiency. In ISCA (2025-06-21). https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731103","author":"Zhao Xia","year":"2025","unstructured":"Xia Zhao, Guangda Zhang, Lu Wang, and Huadong Dai. 2025. UGPU: Dynamically Constructing Unbalanced GPUs for Enhanced Resource Efficiency. In ISCA (2025-06-21). https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731103"},{"key":"e_1_3_2_1_62_1","volume-title":"Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica.","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS.2015.7108420"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772229","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T19:57:18Z","timestamp":1768939038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772229"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":63,"alternative-id":["10.1145\/3772052.3772229","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772229","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}