{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:44:24Z","timestamp":1768347864306,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772240","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"375-387","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["From Bottleneck to Breakthrough: Optimizing Scheduling for Hyperscale Containerized Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7817-5479","authenticated-orcid":false,"given":"Bing","family":"Li","sequence":"first","affiliation":[{"name":"ByteDance Inc., San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0537-6977","authenticated-orcid":false,"given":"Yuquan","family":"Ren","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0107-580X","authenticated-orcid":false,"given":"Xinyi","family":"Song","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6003-6167","authenticated-orcid":false,"given":"Zhilei","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5657-3291","authenticated-orcid":false,"given":"Cong","family":"Xu","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9581-1807","authenticated-orcid":false,"given":"Jingyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance Inc., San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7683-1079","authenticated-orcid":false,"given":"Caixue","family":"Lin","sequence":"additional","affiliation":[{"name":"ByteDance Inc., San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6733-0947","authenticated-orcid":false,"given":"Wu","family":"Xiang","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9122-4703","authenticated-orcid":false,"given":"Rui","family":"Shi","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2012. CNCF. https:\/\/www.cncf.io\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2015. etcd. https:\/\/etcd.io\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2016. Docker Swarm. https:\/\/dockerswarrn.rocks\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2016. Kubernetes. https:\/\/kubernetes.io\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2018. Volcano. https:\/\/github.com\/volcano-sh\/volcano."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737460"},{"key":"e_1_3_2_1_7_1","volume-title":"11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Boutin Eric","year":"2014","unstructured":"Eric Boutin, Jaliya Ekanayake, Wei Lin, Bing Shi, Jingren Zhou, Zhengping Qian, Ming Wu, and Lidong Zhou. 2014. Apollo: Scalable and Coordinated Scheduling for {Cloud-Scale} Computing. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). 285\u2013300."},{"key":"e_1_3_2_1_8_1","volume-title":"2019 24th IEEE international conference on emerging technologies and factory automation (ETFA). IEEE, 1213\u20131217","author":"Casquero Oskar","year":"2019","unstructured":"Oskar Casquero, Aintzane Armentia, Isabel Sarachaga, Federico P\u00e9rez, Dar\u00edo Orive, and Marga Marcos. 2019. Distributed scheduling in Kubernetes based on MAS for Fog-in-the-loop applications. In 2019 24th IEEE international conference on emerging technologies and factory automation (ETFA). IEEE, 1213\u20131217."},{"key":"e_1_3_2_1_9_1","volume-title":"2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Chen Wei","year":"2017","unstructured":"Wei Chen, Jia Rao, and Xiaobo Zhou. 2017. Preemptive, Low Latency Datacenter Scheduling via Lightweight Virtualization. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). USENIX Association, Santa Clara, CA, 251\u2013263. https:\/\/www.usenix.org\/conference\/atc17\/technical-sessions\/presentation\/chen-wei"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","first-page":"85","DOI":"10.24138\/jcomss.v16i1.1027","article-title":"Context-aware Kubernetes scheduler for edge-native applications on 5G","volume":"16","author":"Ogbuachi Michael Chima","year":"2020","unstructured":"Michael Chima Ogbuachi, Anna Reale, P\u00e9ter Suskovics, and Benedek Kov\u00e1cs. 2020. Context-aware Kubernetes scheduler for edge-native applications on 5G. Journal of communications software and systems 16, 1 (2020), 85\u201394.","journal-title":"Journal of communications software and systems"},{"key":"e_1_3_2_1_11_1","volume-title":"2015 USENIX Annual Technical Conference (USENIX ATC 15)","author":"Delgado Pamela","year":"2015","unstructured":"Pamela Delgado, Florin Dinu, Anne-Marie Kermarrec, and Willy Zwaenepoel. 2015. Hawk: Hybrid datacenter scheduling. In 2015 USENIX Annual Technical Conference (USENIX ATC 15). 499\u2013510."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499368.2451125"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2644865.2541941"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2806777.2806779"},{"key":"e_1_3_2_1_15_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Feng Yihui","year":"2021","unstructured":"Yihui Feng, Zhi Liu, Yunjian Zhao, Tatiana Jin, Yidi Wu, Yang Zhang, James Cheng, Chao Li, and Tao Guan. 2021. Scaling large production clusters with partitioned synchronization. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 81\u201397."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/313651.313812"},{"key":"e_1_3_2_1_17_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Hadary Ori","year":"2020","unstructured":"Ori Hadary, Luke Marshall, Ishai Menache, Abhisek Pan, Esaias E Greeff, David Dion, Star Dorminey, Shailesh Joshi, Yang Chen, Mark Russinovich, et al. 2020. Protean:{ VM} allocation service at scale. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 845\u2013861."},{"key":"e_1_3_2_1_18_1","unstructured":"HashiCorp. 2025. Scheduling workloads. https:\/\/developer.hashicorp.com\/nomad\/docs\/concepts\/scheduling"},{"key":"e_1_3_2_1_19_1","volume-title":"Proc. 35th Symp. Mass Storage Syst. Technol. 1\u201314","author":"He Yuan","year":"2020","unstructured":"Yuan He, Lingfeng Xiang, Wen Xia, Hong Jiang, Zhenhua Li, Xuan Wang, and Xiangyu Zou. 2020. Dsync: A lightweight delta synchronization approach for cloud storage services. In Proc. 35th Symp. Mass Storage Syst. Technol. 1\u201314."},{"key":"e_1_3_2_1_20_1","volume-title":"8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11)","author":"Hindman Benjamin","year":"2011","unstructured":"Benjamin Hindman, Andy Konwinski, Matei Zaharia, Ali Ghodsi, Anthony D Joseph, Randy Katz, Scott Shenker, and Ion Stoica. 2011. Mesos: A Platform for {Fine-Grained} Resource Sharing in the Data Center. In 8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11)."},{"key":"e_1_3_2_1_21_1","volume-title":"2020 IEEE International Conference on Cloud Engineering (IC2E). IEEE, 116\u2013123","author":"Huang Jiaming","year":"2020","unstructured":"Jiaming Huang, Chuming Xiao, and WeigangWu. 2020. Rlsk: A job scheduler for federated kubernetes clusters based on reinforcement learning. In 2020 IEEE International Conference on Cloud Engineering (IC2E). IEEE, 116\u2013123."},{"key":"e_1_3_2_1_22_1","volume-title":"2015 USENIX Annual Technical Conference (USENIX ATC 15)","author":"Karanasos Konstantinos","year":"2015","unstructured":"Konstantinos Karanasos, Sriram Rao, Carlo Curino, Chris Douglas, Kishore Chaliparambil, Giovanni Matteo Fumarola, Solom Heddaya, Raghu Ramakrishnan, and Sarvesh Sakalanaga. 2015. Mercury: Hybrid centralized and distributed scheduling in large shared clusters. In 2015 USENIX Annual Technical Conference (USENIX ATC 15). 485\u2013497."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the ACM Symposium on Cloud Computing. 258\u2013272","author":"Li Suyi","year":"2021","unstructured":"Suyi Li, Luping Wang, Wei Wang, Yinghao Yu, and Bo Li. 2021. George: Learning to place long-lived containers in large clusters with operation constraints. In Proceedings of the ACM Symposium on Cloud Computing. 258\u2013272."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342080"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles","author":"Newell Andrew","year":"2021","unstructured":"Andrew Newell, Dimitrios Skarlatos, Jingyuan Fan, Pavan Kumar, Maxim Khutornenko, Mayank Pundir, Yirui Zhang, Mingjun Zhang, Yuanlai Liu, Linh Le, Brendon Daugherty, Apurva Samudra, Prashasti Baid, James Kneeland, Igor Kabiljo, Dmitry Shchukin, Andr\u00e9 Rodrigues, Scott Michelson, Ben Christensen, Kaushik Veeraraghavan, and Chunqiang Tang. 2021. RAS: Continuously Optimized Region-Wide Datacenter Resource Allocation. Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles (2021). https:\/\/api.semanticscholar.org\/CorpusID:239028963"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the Twenty-Fourth ACM Symposium on Operating Systems Principles. 69\u201384","author":"Ousterhout Kay","year":"2013","unstructured":"Kay Ousterhout, Patrick Wendell, Matei Zaharia, and Ion Stoica. 2013. Sparrow: distributed, low latency scheduling. In Proceedings of the Twenty-Fourth ACM Symposium on Operating Systems Principles. 69\u201384."},{"key":"e_1_3_2_1_28_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, 1\u201318. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/qiao"},{"key":"e_1_3_2_1_29_1","unstructured":"SchedMD. 2025. Slurm Workload Manager - Preemption. https:\/\/slurm.schedmd.com\/preempt.html"},{"key":"e_1_3_2_1_30_1","volume-title":"Coscheduling based on PodGroup CRD. https:\/\/github.com\/kubernetes-sigs\/scheduler-plugins\/tree\/master\/kep\/42-podgroup-coscheduling. Accessed","author":"Scheduling Kubernetes SIG","year":"2024","unstructured":"Kubernetes SIG Scheduling. 2023. Coscheduling based on PodGroup CRD. https:\/\/github.com\/kubernetes-sigs\/scheduler-plugins\/tree\/master\/kep\/42-podgroup-coscheduling. Accessed: October 7, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"Trimaran: Real Load Aware Scheduling. https:\/\/github.com\/kubernetes-sigs\/scheduler-plugins\/tree\/master\/kep\/61-Trimaran- real- load- aware- scheduling. Accessed","author":"Scheduling Kubernetes SIG","year":"2023","unstructured":"Kubernetes SIG Scheduling. 2023. Trimaran: Real Load Aware Scheduling. https:\/\/github.com\/kubernetes-sigs\/scheduler-plugins\/tree\/master\/kep\/61-Trimaran- real- load- aware- scheduling. Accessed: October 7, 2024."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2465351.2465386"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3828.3835"},{"key":"e_1_3_2_1_34_1","unstructured":"Abel Souza Kristiaan Pelckmans Devarshi Ghoshal Lavanya Ramakrishnan and Johan Tordsson. 2024. ASA - The Adaptive Scheduling Algorithm. arXiv:2401.09733 [cs.DC] https:\/\/arxiv.org\/abs\/2401.09733"},{"key":"e_1_3_2_1_35_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Tang Chunqiang","year":"2020","unstructured":"Chunqiang Tang, Kenny Yu, Kaushik Veeraraghavan, Jonathan Kaldor, Scott Michelson, Thawan Kooburat, Aravind Anbudurai, Matthew Clark, Kabir Gogia, Long Cheng, et al. 2020. Twine: A unified cluster management system for shared infrastructure. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 787\u2013803."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387517"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2523616.2523633"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2523616.2523633"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Abhishek Verma Luis Pedrosa Madhukar Korupolu David Oppenheimer Eric Tune and John Wilkes. 2015. Large-scale cluster management at Google with Borg. In roceedings of the Tenth European Conference on Computer Systems. 1\u201317.","DOI":"10.1145\/2741948.2741964"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the 2023 ACM Symposium on Cloud Computing. 08\u2013323","author":"Xiang Wu","year":"2023","unstructured":"Wu Xiang, Yakun Li, Yuquan Ren, Fan Jiang, Chaohui Xin, Varun Gupta, Chao Xiang, Xinyi Song, Meng Liu, Bing Li, et al. 2023. G\u00f6del: Unified Large-Scale Resource Management and Scheduling at ByteDance. In Proceedings of the 2023 ACM Symposium on Cloud Computing. 08\u2013323."},{"key":"e_1_3_2_1_41_1","volume-title":"2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID). IEEE, 310\u2013319","author":"Yabuuchi Hidehito","year":"2020","unstructured":"Hidehito Yabuuchi and Takahiro Shinagawa. 2020. Multi-resource Low-latency Cluster Scheduling without Execution Time Estimation. In 2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID). IEEE, 310\u2013319."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.14778\/2733004.2733012"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772240","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:26:15Z","timestamp":1768321575000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772240"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":42,"alternative-id":["10.1145\/3772052.3772240","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772240","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}