{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T14:34:08Z","timestamp":1774449248409,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,5,8]],"date-time":"2023-05-08T00:00:00Z","timestamp":1683504000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021ZD0110202"],"award-info":[{"award-number":["2021ZD0110202"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,5,8]]},"DOI":"10.1145\/3552326.3567499","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:33:02Z","timestamp":1683307982000},"page":"883-898","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["SiloD: A Co-design of Caching and Scheduling for Deep Learning Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2536-0016","authenticated-orcid":false,"given":"Hanyu","family":"Zhao","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2880-7100","authenticated-orcid":false,"given":"Zhenhua","family":"Han","sequence":"additional","affiliation":[{"name":"Microsoft Research, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8219-4499","authenticated-orcid":false,"given":"Zhi","family":"Yang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0557-1104","authenticated-orcid":false,"given":"Quanlu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0792-8966","authenticated-orcid":false,"given":"Mingxia","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0378-060X","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0646-5365","authenticated-orcid":false,"given":"Qianxi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9295-6530","authenticated-orcid":false,"given":"Binyang","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3518-5212","authenticated-orcid":false,"given":"Yuqing","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1590-9749","authenticated-orcid":false,"given":"Lili","family":"Qiu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2727-2703","authenticated-orcid":false,"given":"Lintao","family":"Zhang","sequence":"additional","affiliation":[{"name":"BaseBit Technologies, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7258-3116","authenticated-orcid":false,"given":"Lidong","family":"Zhou","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,5,8]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/ai.facebook.com\/blog\/deepfake-detection-challenge\/","author":"Creating","year":"2020","unstructured":"Creating a dataset and a challenge for deepfakes. https:\/\/ai.facebook.com\/blog\/deepfake-detection-challenge\/, 2020."},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/opensource.google\/projects\/open-images-dataset","author":"Open","year":"2020","unstructured":"Open images dataset. https:\/\/opensource.google\/projects\/open-images-dataset, 2020."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/aws.amazon.com\/s3\/","author":"Amazon","year":"2021","unstructured":"Amazon s3. https:\/\/aws.amazon.com\/s3\/, 2021."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/aws.amazon.com\/sagemaker\/","author":"Amazon","year":"2021","unstructured":"Amazon sagemaker. https:\/\/aws.amazon.com\/sagemaker\/, 2021."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/go.aws\/3DkWUgY","author":"Aws","year":"2021","unstructured":"Aws gpu instances. https:\/\/go.aws\/3DkWUgY, 2021."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/azure.microsoft.com\/en-us\/services\/storage\/blobs\/","author":"Azure","year":"2021","unstructured":"Azure blob storage. https:\/\/azure.microsoft.com\/en-us\/services\/storage\/blobs\/, 2021."},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/azure.microsoft.com\/en-us\/solutions\/data-lake\/","author":"Azure","year":"2021","unstructured":"Azure data lake. https:\/\/azure.microsoft.com\/en-us\/solutions\/data-lake\/, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/ml.azure.com","author":"Azure","year":"2021","unstructured":"Azure machine learning. https:\/\/ml.azure.com, 2021."},{"key":"e_1_3_2_1_9_1","volume-title":"https:\/\/docs.microsoft.com\/en-us\/azure\/virtual-machines\/sizes-gpu","author":"Gpu","year":"2021","unstructured":"Gpu optimized virtual machine sizes. https:\/\/docs.microsoft.com\/en-us\/azure\/virtual-machines\/sizes-gpu, 2021."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/docs.microsoft.com\/en-us\/azure\/storage\/common\/scalability-targets-standard-account","author":"Azure","year":"2022","unstructured":"Azure storage scalability and performance targets for standard storage accounts. https:\/\/docs.microsoft.com\/en-us\/azure\/storage\/common\/scalability-targets-standard-account, 2022."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/www.nvidia.com\/en-gb\/data-center\/products\/","author":"Nvidia","year":"2022","unstructured":"Nvidia gpu generations. https:\/\/www.nvidia.com\/en-gb\/data-center\/products\/, 2022."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/248156.248182"},{"key":"e_1_3_2_1_13_1","volume-title":"Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675","author":"Abu-El-Haija Sami","year":"2016","unstructured":"Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675, 2016."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/69.755618"},{"key":"e_1_3_2_1_15_1","volume-title":"High performance i\/o for large scale deep learning","author":"Aizman Alex","year":"2020","unstructured":"Alex Aizman, Gavin Maltby, and Thomas Breuel. High performance i\/o for large scale deep learning, 2020."},{"key":"e_1_3_2_1_16_1","first-page":"267","volume-title":"9th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 12)","author":"Ananthanarayanan Ganesh","year":"2012","unstructured":"Ganesh Ananthanarayanan, Ali Ghodsi, Andrew Warfield, Dhruba Borthakur, Srikanth Kandula, Scott Shenker, and Ion Stoica. Pacman: Coordinated memory caching for parallel jobs. In 9th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 12), pages 267--280, 2012."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/346000.346003"},{"key":"e_1_3_2_1_18_1","first-page":"389","volume-title":"15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18)","author":"Beckmann Nathan","year":"2018","unstructured":"Nathan Beckmann, Haoxian Chen, and Asaf Cidon. {LHD}: Improving cache hit rate by maximizing hit density. In 15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18), pages 389--403, 2018."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3286062.3286082"},{"key":"e_1_3_2_1_20_1","first-page":"285","volume-title":"11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Boutin Eric","year":"2014","unstructured":"Eric Boutin, Jaliya Ekanayake, Wei Lin, Bing Shi, Jingren Zhou, Zhengping Qian, Ming Wu, and Lidong Zhou. Apollo: Scalable and coordinated scheduling for cloud-scale computing. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14), pages 285--300, 2014."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2890784"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48228-8_12"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_25_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2038916.2038939"},{"key":"e_1_3_2_1_27_1","volume-title":"8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11)","author":"Ghodsi Ali","year":"2011","unstructured":"Ali Ghodsi, Matei Zaharia, Benjamin Hindman, Andy Konwinski, Scott Shenker, and Ion Stoica. Dominant resource fairness: Fair allocation of multiple resource types. In 8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11), 2011."},{"key":"e_1_3_2_1_28_1","first-page":"365","volume-title":"Choosy: Maxmin fair sharing for datacenter jobs with constraints","author":"Ghodsi Ali","year":"2013","unstructured":"Ali Ghodsi, Matei Zaharia, Scott Shenker, and Ion Stoica. Choosy: Maxmin fair sharing for datacenter jobs with constraints. pages 365--378, 04 2013."},{"key":"e_1_3_2_1_29_1","first-page":"99","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Gog Ionel","year":"2016","unstructured":"Ionel Gog, Malte Schwarzkopf, Adam Gleave, Robert NM Watson, and Steven Hand. Firmament: Fast, centralized cluster scheduling at scale. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16), pages 99--115, 2016."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"e_1_3_2_1_32_1","first-page":"689","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Graur Dan","year":"2022","unstructured":"Dan Graur, Damien Aymon, Dan Kluser, Tanguy Albrici, Chandramohan A Thekkath, and Ana Klimovic. Cachew: Machine learning input data processing as a service. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 689--706, 2022."},{"key":"e_1_3_2_1_33_1","volume-title":"Introducing petastorm: Uber atg's data access library for deep learning. https:\/\/eng.uber.com\/petastorm\/","author":"Gruener Robbie","year":"2018","unstructured":"Robbie Gruener, Owen Cheng, and Yevgeni Litvin. Introducing petastorm: Uber atg's data access library for deep learning. https:\/\/eng.uber.com\/petastorm\/, 2018."},{"key":"e_1_3_2_1_34_1","first-page":"485","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. Tiresias: A GPU cluster manager for distributed deep learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19), pages 485--500, 2019."},{"key":"e_1_3_2_1_35_1","first-page":"2535","volume-title":"International Conference on Machine Learning","author":"Hacohen Guy","year":"2019","unstructured":"Guy Hacohen and Daphna Weinshall. On the power of curriculum learning in training deep networks. In International Conference on Machine Learning, pages 2535--2544. PMLR, 2019."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_37_1","first-page":"57","volume-title":"Optimized locality-aware memory allocation for key-value cache. In 2015 {USENIX} Annual Technical Conference ({USENIX}{ATC} 15)","author":"Hu Xiameng","year":"2015","unstructured":"Xiameng Hu, Xiaolin Wang, Yechen Li, Lan Zhou, Yingwei Luo, Chen Ding, Song Jiang, and Zhenlin Wang. {LAMA}: Optimized locality-aware memory allocation for key-value cache. In 2015 {USENIX} Annual Technical Conference ({USENIX}{ATC} 15), pages 57--69, 2015."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1629575.1629601"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267827"},{"key":"e_1_3_2_1_40_1","volume-title":"Aggregating local image descriptors into compact codes","author":"J\u00e9gou Herv\u00e9","year":"2011","unstructured":"Herv\u00e9 J\u00e9gou, Florent Perronnin, Matthijs Douze, Jorge S\u00e1nchez, Patrick P\u00e9rez, and Cordelia Schmid. Aggregating local image descriptors into compact codes. IEEE transactions on pattern analysis and machine intelligence, 34(9):1704--1716, 2011."},{"key":"e_1_3_2_1_41_1","volume-title":"May","author":"Jeon Myeongjae","year":"2018","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. Multi-tenant GPU clusters for deep learning workloads: Analysis and implications. MSR-TR-2018-13, May 2018."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/3357034.3357049"},{"key":"e_1_3_2_1_43_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25:1097--1105","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25:1097--1105, 2012."},{"key":"e_1_3_2_1_44_1","volume-title":"18th USENIX Conference on File and Storage Technologies (FAST 2020","author":"Kumar Abhishek Vijaya","year":"2020","unstructured":"Abhishek Vijaya Kumar and Muthian Sivathanu. Quiver: An informed storage cache for deep learning. In 18th USENIX Conference on File and Storage Technologies (FAST 2020). USENIX, February 2020."},{"key":"e_1_3_2_1_45_1","first-page":"537","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Lee Gyewon","year":"2021","unstructured":"Gyewon Lee, Irene Lee, Hyeonmin Ha, Kyunggeun Lee, Hwarim Hyun, Ahnjae Shin, and Byung-Gon Chun. Refurbish your training data: Reusing partially augmented samples for faster deep neural network training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21), pages 537--550, 2021."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_12"},{"key":"e_1_3_2_1_48_1","first-page":"289","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. Themis: Fair and efficient gpu cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20), pages 289--304, 2020."},{"key":"e_1_3_2_1_49_1","first-page":"579","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Mohan Jayashree","year":"2022","unstructured":"Jayashree Mohan, Amar Phanishayee, Janardhan Kulkarni, and Vijay Chidambaram. Looking beyond GPUs for DNN scheduling on Multi-Tenant clusters. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 579--596, Carlsbad, CA, July 2022. USENIX Association."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.14778\/3446095.3446100"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476374"},{"key":"e_1_3_2_1_52_1","first-page":"481","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. Heterogeneity-aware cluster scheduling policies for deep learning workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 481--498. USENIX Association, November 2020."},{"key":"e_1_3_2_1_53_1","volume-title":"Deep learning and ai in the cloud with nfs storage. https:\/\/cloud.netapp.com\/blog\/ai-and-deep-learning-in-the-cloud","year":"2019","unstructured":"NetApp. Deep learning and ai in the cloud with nfs storage. https:\/\/cloud.netapp.com\/blog\/ai-and-deep-learning-in-the-cloud, 2019."},{"key":"e_1_3_2_1_54_1","volume-title":"The 10th USENIX Symposium on Operating Systems Design and Implementation (OSDI '12)","author":"Nightingale Ed","year":"2012","unstructured":"Ed Nightingale, Jeremy Elson, Jinliang Fan, Owen Hofmann, Jon Howell, and Yutaka Suzue. Flat datacenter storage. In The 10th USENIX Symposium on Operating Systems Design and Implementation (OSDI '12). USENIX, October 2012."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/170036.170081"},{"key":"e_1_3_2_1_56_1","first-page":"234","volume-title":"Proceedings of the 2006 international conference on Compilers, architecture and synthesis for embedded systems","author":"Jung Dawoon","year":"2006","unstructured":"Seon-yeong Park, Dawoon Jung, Jeong-uk Kang, Jin-soo Kim, and Joonwon Lee. Cflru: a replacement algorithm for flash memory. In Proceedings of the 2006 international conference on Compilers, architecture and synthesis for embedded systems, pages 234--241, 2006."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_58_1","volume-title":"Hoard: A distributed data caching system to accelerate deep learning training on the cloud","author":"Pinto Christian","year":"2018","unstructured":"Christian Pinto, Yiannis Gkoufas, Andrea Reale, Seetharami Seelam, and Steven Eliuk. Hoard: A distributed data caching system to accelerate deep learning training on the cloud, 2018."},{"key":"e_1_3_2_1_59_1","volume-title":"et al. Singularity: Planet-scale, preemptible, elastic scheduling of ai workloads. arXiv preprint arXiv:2202.07848","author":"Shukla Dharma","year":"2022","unstructured":"Dharma Shukla, Muthian Sivathanu, Srinidhi Viswanatha, Bhargav Gulavani, Rimma Nehme, Amey Agrawal, Chen Chen, Nipun Kwatra, Ramachandran Ramjee, Pankaj Sharma, et al. Singularity: Planet-scale, preemptible, elastic scheduling of ai workloads. arXiv preprint arXiv:2202.07848, 2022."},{"key":"e_1_3_2_1_60_1","first-page":"529","volume-title":"17th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 20)","author":"Song Zhenyu","year":"2020","unstructured":"Zhenyu Song, Daniel S Berger, Kai Li, Anees Shaikh, Wyatt Lloyd, Soudeh Ghorbani, Changhoon Kim, Aditya Akella, Arvind Krishnamurthy, Emmett Witchel, et al. Learning relaxed belady for content distribution network caching. In 17th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 20), pages 529--544, 2020."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-017-1055-5"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737396"},{"key":"e_1_3_2_1_64_1","first-page":"6105","volume-title":"International Conference on Machine Learning","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. Efficientnet: Rethinking model scaling for convolutional neural networks. In International Conference on Machine Learning, pages 6105--6114. PMLR, 2019."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901355"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685072"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404397.3404472"},{"key":"e_1_3_2_1_68_1","first-page":"595","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et al. Gandiva: Introspective cluster scheduling for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 595--610, 2018."},{"key":"e_1_3_2_1_69_1","first-page":"533","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. Antman: Dynamic scaling on GPU clusters for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 533--548. USENIX Association, November 2020."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/1755913.1755940"},{"key":"e_1_3_2_1_71_1","first-page":"515","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis CM Lau, Yuqi Wang, Yifan Xiong, et al. Hived: Sharing a gpu cluster for deep learning with guarantees. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 515--532, 2020."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CISS.2018.8362276"}],"event":{"name":"EuroSys '23: Eighteenth European Conference on Computer Systems","location":"Rome Italy","acronym":"EuroSys '23","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Eighteenth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3552326.3567499","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3552326.3567499","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:39Z","timestamp":1750178859000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3552326.3567499"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,8]]},"references-count":72,"alternative-id":["10.1145\/3552326.3567499","10.1145\/3552326"],"URL":"https:\/\/doi.org\/10.1145\/3552326.3567499","relation":{},"subject":[],"published":{"date-parts":[[2023,5,8]]},"assertion":[{"value":"2023-05-08","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}