{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T16:22:50Z","timestamp":1772727770436,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":83,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T00:00:00Z","timestamp":1732060800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,20]]},"DOI":"10.1145\/3698038.3698522","type":"proceedings-article","created":{"date-parts":[[2024,11,14]],"date-time":"2024-11-14T06:32:43Z","timestamp":1731565963000},"page":"302-321","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Scheduling for Reduced Tail Task Latencies in Highly Utilized Datacenters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8525-6305","authenticated-orcid":false,"given":"Smita","family":"Vijayakumar","sequence":"first","affiliation":[{"name":"University of Cambridge, Cambridge, UK"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8954-2428","authenticated-orcid":false,"given":"Anil","family":"Madhavapeddy","sequence":"additional","affiliation":[{"name":"University of Cambridge, Cambridge, UK"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0753-1261","authenticated-orcid":false,"given":"Evangelia","family":"Kalyvianaki","sequence":"additional","affiliation":[{"name":"University of Cambridge, Cambridge, UK"}]}],"member":"320","published-online":{"date-parts":[[2024,11,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2150976.2150984"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3199524.3199564"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the 2018 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '18). USENIX Association, USA, 533--546","author":"Amvrosiadis George","year":"2018","unstructured":"George Amvrosiadis, Jun Woo Park, Gregory R. Ganger, Garth A. Gibson, Elisabeth Baseman, and Nathan DeBardeleben. 2018. On the Diversity of Cluster Workloads and Its Impact on Research Results. In Proceedings of the 2018 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '18). USENIX Association, USA, 533--546."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the 10th USENIX Conference on Networked Systems Design and Implementation","author":"Ananthanarayanan Ganesh","year":"2013","unstructured":"Ganesh Ananthanarayanan, Ali Ghodsi, Scott Shenker, and Ion Stoica. 2013. Effective Straggler Mitigation: Attack of the Clones. In Proceedings of the 10th USENIX Conference on Networked Systems Design and Implementation (Lombard, IL) (NSDI'13). USENIX Association, USA, 185--198."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3406221"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456259"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CANOPIE-HPC49598.2019.00007"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation","author":"Boutin Eric","year":"2014","unstructured":"Eric Boutin, Jaliya Ekanayake, Wei Lin, Bing Shi, Jingren Zhou, Zhengping Qian, Ming Wu, and Lidong Zhou. 2014. Apollo: Scalable and Coordinated Scheduling for Cloud-Scale Computing. In Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation (Broomfield, CO) (OSDI'14). USENIX Association, USA, 285--300."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465286"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3344341.3368798"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2670979.2670999"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.14778\/2367502.2367519"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MASCOTS.2011.12"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2806777.2806843"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/90.650143"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 16th USENIX Conference on Networked Systems Design and Implementation","author":"Curino Carlo","unstructured":"Carlo Curino, Subru Krishnan, Konstantinos Karanasos, Sriram Rao, Giovanni M. Fumarola, Botong Huang, Kishore Chaliparambil, Arun Suresh, Young Chen, Solom Heddaya, and et al. 2019. Hydra: A Federated Resource Manager for Data-Center Scale Analytics. In Proceedings of the 16th USENIX Conference on Networked Systems Design and Implementation (Boston, MA, USA) (NSDI'19). USENIX Association, USA, 177--191."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2408776.2408794"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/1327452.1327492"},{"key":"e_1_3_2_1_19_1","unstructured":"Pamela Delgado. 2015. Eagle Simulator. https:\/\/github.com\/epfl-labos\/eagle"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2987550.2987563"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267838"},{"key":"e_1_3_2_1_22_1","volume-title":"Hawk: Hybrid Datacenter Scheduling. In 2015 USENIX Annual Technical Conference (USENIX ATC 15)","author":"Delgado Pamela","year":"2015","unstructured":"Pamela Delgado, Florin Dinu, Anne-Marie Kermarrec, and Willy Zwaenepoel. 2015. Hawk: Hybrid Datacenter Scheduling. In 2015 USENIX Annual Technical Conference (USENIX ATC 15). USENIX Association, Santa Clara, CA, 499--510. https:\/\/www.usenix.org\/conference\/atc15\/technical-session\/presentation\/delgado"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483571"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626322"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Dmitry Duplyakin Robert Ricci Aleksander Maricq Gary Wong Jonathon Duerig Eric Eide Leigh Stoller Mike Hibler David Johnson Kirk Webb et al. 2019. The design and operation of {CloudLab}. In 2019 USENIX annual technical conference (USENIX ATC 19). 1--14.","DOI":"10.1109\/ICNP.2019.8888128"},{"key":"e_1_3_2_1_26_1","unstructured":"etcd. 2024. A distributed reliable key-value store for the most critical data of a distributed system. https:\/\/etcd.io"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190549"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 8th USENIX Conference on Networked Systems Design and Implementation","author":"Ghodsi Ali","year":"2011","unstructured":"Ali Ghodsi, Matei Zaharia, Benjamin Hindman, Andy Konwinski, Scott Shenker, and Ion Stoica. 2011. Dominant Resource Fairness: Fair Allocation of Multiple Resource Types. In Proceedings of the 8th USENIX Conference on Networked Systems Design and Implementation (Boston, MA) (NSDI'11). USENIX Association, USA, 323--336."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation","author":"Gog Ionel","year":"2016","unstructured":"Ionel Gog, Malte Schwarzkopf, Adam Gleave, Robert N. M. Watson, and Steven Hand. 2016. Firmament: Fast, Centralized Cluster Scheduling at Scale. In Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation (Savannah, GA, USA) (OSDI'16). USENIX Association, USA, 99--115."},{"key":"e_1_3_2_1_30_1","unstructured":"Google. 2017. Addressing Cascading Failures Site Reliability Engineering. https:\/\/sre.google\/sre-book\/addressing-cascading-failures\/"},{"key":"e_1_3_2_1_31_1","volume-title":"Pronto: Federated Task Scheduling. arXiv:2104.13429 [cs.DC]","author":"Grammenos Andreas","year":"2021","unstructured":"Andreas Grammenos, Evangelia Kalyvianaki, and Peter Pietzuch. 2021. Pronto: Federated Task Scheduling. arXiv:2104.13429 [cs.DC]"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026884"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570612"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694384"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694384"},{"key":"e_1_3_2_1_37_1","volume-title":"Performance Modeling and Design of Computer Systems: Queueing Theory in Action","author":"Harchol-Balter Mor","unstructured":"Mor Harchol-Balter. 2013. Performance Modeling and Design of Computer Systems: Queueing Theory in Action (1st ed.). Cambridge University Press, USA.","edition":"1"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 8th USENIX Conference on Networked Systems Design and Implementation","author":"Hindman Benjamin","year":"2011","unstructured":"Benjamin Hindman, Andy Konwinski, Matei Zaharia, Ali Ghodsi, Anthony D. Joseph, Randy Katz, Scott Shenker, and Ion Stoica. 2011. Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center. In Proceedings of the 8th USENIX Conference on Networked Systems Design and Implementation (Boston, MA) (NSDI'11). USENIX Association, USA, 295--308."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2017.105"},{"key":"e_1_3_2_1_40_1","volume-title":"Metastable Failures in the Wild. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Huang Lexiang","year":"2022","unstructured":"Lexiang Huang, Matthew Magnusson, Abishek Bangalore Muralikrishna, Salman Estyak, Rebecca Isaacs, Abutalib Aghayev, Timothy Zhu, and Aleksey Charapko. 2022. Metastable Failures in the Wild. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 73--90. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/huang-lexiang"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/1629575.1629601"},{"key":"e_1_3_2_1_42_1","volume-title":"Jette and Tim Wickberg","author":"Morris","year":"2023","unstructured":"Morris A. Jette and Tim Wickberg. 2023. Architecture of the Slurm Workload Manager. In Job Scheduling Strategies for Parallel Processing, Dalibor Klus\u00e1\u010dek, Julita Corbal\u00e1n, and Gonzalo P. Rodrigo (Eds.). Springer Nature Switzerland, Cham, 3--23."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 16th USENIX Conference on Networked Systems Design and Implementation","author":"Chong Kostis","year":"2019","unstructured":"Kaffes, Kostis and Chong, Timothy and Humphries, Jack Tigar and Belay, Adam and Mazi\u00e8res, David and Kozyrakis, Christos. 2019. Shinjuku: Preemptive Scheduling for Msecond-Scale Tail Latency. In Proceedings of the 16th USENIX Conference on Networked Systems Design and Implementation (Boston, MA, USA) (NSDI'19). USENIX Association, USA, 345--359."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 2015 USENIX Conference on Usenix Annual Technical Conference (Santa Clara, CA) (USENIX ATC '15). USENIX Association, USA, 485--497","author":"Karanasos Konstantinos","year":"2015","unstructured":"Konstantinos Karanasos, Sriram Rao, Carlo Curino, Chris Douglas, Kishore Chaliparambil, Giovanni Matteo Fumarola, Solom Heddaya, Raghu Ramakrishnan, and Sarvesh Sakalanaga. 2015. Mercury: Hybrid Centralized and Distributed Scheduling in Large Shared Clusters. In Proceedings of the 2015 USENIX Conference on Usenix Annual Technical Conference (Santa Clara, CA) (USENIX ATC '15). USENIX Association, USA, 485--497."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541944"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-96983-1_13"},{"key":"e_1_3_2_1_47_1","unstructured":"Kubernetes. 2021. https:\/\/github.com\/kubernetes\/community\/blob\/master\/contributors\/devel\/sig-scheduling\/scheduling_code_hierarchy_overview.md."},{"key":"e_1_3_2_1_48_1","unstructured":"Kubernetes. 2023. Why the number of pods per node should not exceed 110? https:\/\/github.com\/kubernetes\/kubernetes\/issues\/119391"},{"key":"e_1_3_2_1_49_1","unstructured":"Kubernetes. 2024. An open-source system for automating deployment scaling and management of containerized applications. https:\/\/kubernetes.io\/docs\/home"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/UCC48980.2020.00056"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2670979.2670988"},{"key":"e_1_3_2_1_52_1","unstructured":"The Amazon Builders' Library. 2024. Using load shedding to avoid overload. https:\/\/aws.amazon.com\/builders-library\/using-load-shedding-to-avoid-overload\/."},{"key":"e_1_3_2_1_53_1","unstructured":"L. Mai E. Kalyvianaki and P. Costa. 2013. Exploiting Time-Malleability in Cloud-based Batch Processing Systems. (2013). https:\/\/openaccess.city.ac.uk\/id\/eprint\/8179\/ Unpublished."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342080"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3278532.3278566"},{"key":"e_1_3_2_1_56_1","volume-title":"Defcon: Preventing Overload with Graceful Feature Degradation. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Meza Justin J.","year":"2023","unstructured":"Justin J. Meza, Thote Gowda, Ahmed Eid, Tomiwa Ijaware, Dmitry Chernyshev, Yi Yu, Md Nazim Uddin, Rohan Das, Chad Nachiappan, Sari Tran, Shuyang Shi, Tina Luo, David Ke Hong, Sankaralingam Panneerselvam, Hans Ragas, Svetlin Manavski, Weidong Wang, and Francois Richard. 2023. Defcon: Preventing Overload with Graceful Feature Degradation. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 607- 622. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/meza"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.932708"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488793"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522716"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190515"},{"key":"e_1_3_2_1_61_1","volume-title":"How to Trade off Server Utilization and Tail Latency","author":"Plenz Julius","unstructured":"Julius Plenz. 2019. How to Trade off Server Utilization and Tail Latency. USENIX Association, Singapore."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544265"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901354"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2465351.2465386"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13677-023-00471-1"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787508"},{"key":"e_1_3_2_1_67_1","unstructured":"Apache Spark. 2024. A unified engine for large-scale data analytics. https:\/\/spark.apache.org\/docs\/latest\/job-scheduling.html"},{"key":"e_1_3_2_1_68_1","unstructured":"The Apache Software Foundation. 2022. The Hadoop Fair Scheduler. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-yarn\/hadoop-yarn-site\/FairScheduler.html"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387517"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2007.70606"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901355"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341325.3341992"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2014.6968735"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/2741948.2741964"},{"key":"e_1_3_2_1_75_1","unstructured":"Smita Vijayakumar. 2023. Mumuration's Prototype. https:\/\/github.com\/csesmita\/murmuration_prototype."},{"key":"e_1_3_2_1_76_1","unstructured":"Smita Vijayakumar. 2023. Mumuration's Simulator. https:\/\/github.com\/csesmita\/murmuration_simulator."},{"key":"e_1_3_2_1_77_1","unstructured":"Smita Vijayakumar. 2024. Mumuration's Helper Scripts. https:\/\/github.com\/csesmita\/kubernetes-helper."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2847220.2847223"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357223.3362728"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/2007477.1952709"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/1755913.1755940"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495861"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337851"}],"event":{"name":"SoCC '24: ACM Symposium on Cloud Computing","location":"Redmond WA USA","acronym":"SoCC '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3698038.3698522","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3698038.3698522","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:59:45Z","timestamp":1755889185000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3698038.3698522"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,20]]},"references-count":83,"alternative-id":["10.1145\/3698038.3698522","10.1145\/3698038"],"URL":"https:\/\/doi.org\/10.1145\/3698038.3698522","relation":{},"subject":[],"published":{"date-parts":[[2024,11,20]]},"assertion":[{"value":"2024-11-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}