{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T18:12:03Z","timestamp":1764785523932,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2106027","CNS-2214980","CNS-2146909","CCF-2046444."],"award-info":[{"award-number":["CNS-2106027","CNS-2214980","CNS-2146909","CCF-2046444."]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1145\/3652892.3700767","type":"proceedings-article","created":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T19:36:13Z","timestamp":1732736173000},"page":"299-312","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Cannikin: Optimal Adaptive Distributed DNN Training over Heterogeneous Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3806-6310","authenticated-orcid":false,"given":"Chengyi","family":"Nie","sequence":"first","affiliation":[{"name":"Stony Brook University, Stony Brook, US"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6070-7378","authenticated-orcid":false,"given":"Jessica","family":"Maghakian","sequence":"additional","affiliation":[{"name":"Stony Brook University, Stony Brook, US"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8026-4502","authenticated-orcid":false,"given":"Zhenhua","family":"Liu","sequence":"additional","affiliation":[{"name":"Stony Brook University, Stony Brook, US"}]}],"member":"320","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/169627.169855"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"2011. Chapter 4 - The CUDA Execution Model. In CUDA Application Design and Development Rob Farber (Ed.). Morgan Kaufmann Boston 85--108. 10.1016\/B978-0-12-388426-8.00004-5","DOI":"10.1016\/B978-0-12-388426-8.00004-5"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","unstructured":"Mart\u00edn Abadi Paul Barham and etc Chen Jianmin. 2016. TensorFlow: A system for large-scale machine learning. 10.48550\/ARXIV.1605.08695","DOI":"10.48550\/ARXIV.1605.08695"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Dario Amodei Rishita Anubhai and etc Battenberg Eric. 2015. Deep Speech 2: End-to-End Speech Recognition in English and Mandarin. 10.48550\/ARXIV.1512.02595","DOI":"10.48550\/ARXIV.1512.02595"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, and etc Dean, Jeff. 2022. Pathways: Asynchronous Distributed Dataflow for ML. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 430--449. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/98dce83da57b0395e163467c9dae521b-Paper.pdf"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1137\/0721041"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531462"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"e_1_3_2_1_11_1","volume-title":"Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 199--216. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/choi-seungbeom"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","unstructured":"Aditya Devarakonda Maxim Naumov and Michael Garland. 2017. AdaBatch: Adaptive Batch Sizes for Training Deep Neural Networks. 10.48550\/ARXIV.1712.02029","DOI":"10.48550\/ARXIV.1712.02029"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1810.04805"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i17.17813"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, and etc Alzayat, Safya. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20). USENIX Association, USA, Article 25, 20 pages."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Aaron Harlap Deepak Narayanan and etc Phanishayee Amar. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. 10.48550\/ARXIV.1806.03377","DOI":"10.48550\/ARXIV.1806.03377"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2827872"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. 10.48550\/ARXIV.1512.03385","DOI":"10.48550\/ARXIV.1512.03385"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","unstructured":"Xiangnan He Lizi Liao Hanwang Zhang Liqiang Nie Xia Hu and Tat-Seng Chua. 2017. Neural Collaborative Filtering. 10.48550\/ARXIV.1708.05031","DOI":"10.48550\/ARXIV.1708.05031"},{"key":"e_1_3_2_1_21_1","unstructured":"Facebook incubator. [n. d.]. Facebookincubator\/gloo: Collective Communications Library with various primitives for multi-machine training. https:\/\/github.com\/facebookincubator\/gloo."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613175"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC '20). USENIX Association.","author":"Keahey Kate","year":"2020","unstructured":"Kate Keahey, Jason Anderson, and etc Zhuo Zhen. 2020. Lessons Learned from the Chameleon Testbed. In Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC '20). USENIX Association."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532370"},{"key":"e_1_3_2_1_27_1","unstructured":"Alex Krizhevsky. 2009. Learning Multiple Layers of Features from Tiny Images."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387547"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"Mingzhen Li Wencong Xiao and etc Sun Biao. 2022. EasyScale: Accuracy-consistent Elastic Training for Deep Learning. 10.48550\/ARXIV.2208.14228","DOI":"10.48550\/ARXIV.2208.14228"},{"key":"e_1_3_2_1_30_1","volume-title":"Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, and etc Varma, Rohan. 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Luo Liang","year":"2022","unstructured":"Liang Luo, Peter West, and etc Patel, Pratyush. 2022. SRIFTY: Swift and Thrifty Distributed Neural Network Training on the Cloud. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 833--847. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/f457c545a9ded88f18ecee47145a72c0-Paper.pdf"},{"key":"e_1_3_2_1_32_1","volume-title":"KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Mai Luo","year":"2020","unstructured":"Luo Mai, Guo Li, and etc Wagenl\u00e4nder. 2020. KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 937--954."},{"key":"e_1_3_2_1_33_1","volume-title":"An empirical model of large-batch training. arXiv preprint arXiv:1812.06162","author":"McCandlish Sam","year":"2018","unstructured":"Sam McCandlish, Jared Kaplan, Dario Amodei, and OpenAI Dota Team. 2018. An empirical model of large-batch training. arXiv preprint arXiv:1812.06162 (2018)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.5555\/2600239.2600241"},{"key":"e_1_3_2_1_35_1","volume-title":"Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. 2020. Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 481--498. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/narayanan-deepak"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Eriko Nurvitadhi Jeffrey Cook and etc Mishra Asit. 2018. In-Package Domain-Specific ASICs for Intel\u00ae Stratix\u00ae 10 FPGAs: A Case Study of Accelerating Deep Learning Using TensorTile ASIC. In 2018 28th International Conference on Field Programmable Logic and Applications (FPL). 106--1064. 10.1109\/FPL.2018.00027","DOI":"10.1109\/FPL.2018.00027"},{"key":"e_1_3_2_1_37_1","unstructured":"Nvidia. [n. d.]. Nvidia\/NCCL: Optimized Primitives for collective multi-gpu communication. https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1080\/00031305.1992.10475842"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_40_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Park Jay H.","year":"2020","unstructured":"Jay H. Park, Gyeongchan Yun, and etc Chang M. Yi. 2020. HetPipe: Enabling Large DNN Training on (Whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). USENIX Association, 307--321. https:\/\/www.usenix.org\/conference\/atc20\/presentation\/park"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"volume-title":"Paleo: A Performance Model for Deep Neural Networks. In International Conference on Learning Representations.","author":"Sparks Evan R.","key":"e_1_3_2_1_44_1","unstructured":"Qi, Evan R. Sparks, and Ameet S. Talwalkar. 2016. Paleo: A Performance Model for Deep Neural Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_45_1","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang Keun Choe, and etc Subramanya, Suhas Jayaram. 2021. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/6.591665"},{"key":"e_1_3_2_1_48_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 (2018). arXiv:1802.05799 http:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9891914"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","unstructured":"Mohammad Shoeybi Mostofa Patwary and etc Puri Raul. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. 10.48550\/ARXIV.1909.08053","DOI":"10.48550\/ARXIV.1909.08053"},{"key":"e_1_3_2_1_51_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, and etc Parmar, Niki. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_52_1","volume-title":"Transparent GPU Sharing in Container Clouds for Deep Learning Workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Zili Zhang, Zhihao Bai, Xuanzhe Liu, and Xin Jin. 2023. Transparent GPU Sharing in Container Clouds for Deep Learning Workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 69--85. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/wu"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","unstructured":"Yonghui Wu Mike Schuster and etc Chen Zhifeng. 2016. Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. 10.48550\/ARXIV.1609.08144","DOI":"10.48550\/ARXIV.1609.08144"},{"key":"e_1_3_2_1_54_1","volume-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, and etc Yong Li. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 533--548. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/xiao"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2022.3220224"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386367.3432728"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225069"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3127490"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3285029"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240801"}],"event":{"name":"Middleware '24: 25th International Middleware Conference","sponsor":["IFIP","Usenix"],"location":"Hong Kong Hong Kong","acronym":"Middleware '24"},"container-title":["Proceedings of the 25th International Middleware Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652892.3700767","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652892.3700767","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652892.3700767","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:53:57Z","timestamp":1750287237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652892.3700767"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":60,"alternative-id":["10.1145\/3652892.3700767","10.1145\/3652892"],"URL":"https:\/\/doi.org\/10.1145\/3652892.3700767","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2024-12-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}