{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:14:38Z","timestamp":1780708478878,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620666.3651379","type":"proceedings-article","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T12:08:21Z","timestamp":1713960501000},"page":"178-191","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":35,"title":["Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6416-7172","authenticated-orcid":false,"given":"Chang","family":"Chen","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4896-121X","authenticated-orcid":false,"given":"Xiuhong","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5021-2912","authenticated-orcid":false,"given":"Qianchao","family":"Zhu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6327-2033","authenticated-orcid":false,"given":"Jiangfei","family":"Duan","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hongkong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8456-0491","authenticated-orcid":false,"given":"Peng","family":"Sun","sequence":"additional","affiliation":[{"name":"Shanghai AI Lab, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8525-0608","authenticated-orcid":false,"given":"Xingcheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai AI Lab, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7426-6248","authenticated-orcid":false,"given":"Chao","family":"Yang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners. In H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin, editors, Advances in Neural Information Processing Systems, volume 33, pages 1877--1901. Curran Associates, Inc., 2020."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS57875.2023.00015"},{"key":"e_1_3_2_1_4_1","volume-title":"Gc3: An optimizing compiler for gpu collective communication","author":"Cowan Meghan","year":"2022","unstructured":"Meghan Cowan, Saeed Maleki, Madan Musuvathi, Olli Saarikivi, and Yifan Xiong. Gc3: An optimizing compiler for gpu collective communication. 2022."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of deep bidirectional transformers for language understanding. In Jill Burstein, Christy Doran, and Thamar Solorio, editors, Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pages 4171--4186, Minneapolis, Minnesota, June 2019. Association for Computational Linguistics."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_7_1","volume-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. CoRR, abs\/2101.03961","author":"Fedus William","year":"2021","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. CoRR, abs\/2101.03961, 2021."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTR.2006.311904"},{"key":"e_1_3_2_1_9_1","first-page":"418","volume-title":"Proceedings of Machine Learning and Systems","volume":"1","author":"Hashemi Sayed Hadi","year":"2019","unstructured":"Sayed Hadi Hashemi, Sangeetha Abdu Jyothi, and Roy Campbell. Tic-tac: Accelerating distributed deep learning with communication scheduling. In A. Talwalkar, V. Smith, and M. Zaharia, editors, Proceedings of Machine Learning and Systems, volume 1, pages 418--430, 2019."},{"key":"e_1_3_2_1_10_1","volume-title":"Curran Associates Inc.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. GPipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism. Curran Associates Inc., Red Hook, NY, USA, 2019."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_12_1","volume-title":"Beyond data and model parallelism for deep neural networks. CoRR, abs\/1807.05358","author":"Jia Zhihao","year":"2018","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. Beyond data and model parallelism for deep neural networks. CoRR, abs\/1807.05358, 2018."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_14_1","volume-title":"Scaling laws for neural language models. CoRR, abs\/2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B. Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. Scaling laws for neural language models. CoRR, abs\/2001.08361, 2020."},{"key":"e_1_3_2_1_15_1","volume-title":"Reducing activation recomputation in large transformer models","author":"Korthikanti Vijay","year":"2022","unstructured":"Vijay Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. Reducing activation recomputation in large transformer models, 2022."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems, NIPS'20","author":"Kwon Woosuk","year":"2020","unstructured":"Woosuk Kwon, Gyeong-In Yu, Eunji Jeong, and Byung-Gon Chun. Nimble: Lightweight and parallel gpu task scheduling for deep learning. In Proceedings of the 34th International Conference on Neural Information Processing Systems, NIPS'20, Red Hook, NY, USA, 2020. Curran Associates Inc."},{"key":"e_1_3_2_1_17_1","volume-title":"Breadth-first pipeline parallelism","author":"Lamy-Poirier Joel","year":"2023","unstructured":"Joel Lamy-Poirier. Breadth-first pipeline parallelism, 2023."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_20_1","first-page":"809","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Mahajan Kshiteej","year":"2023","unstructured":"Kshiteej Mahajan, Ching-Hsiang Chu, Srinivas Sridharan, and Aditya Akella. Better together: Jointly optimizing ML collective scheduling and execution planning using SYNDICATE. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 809--824, Boston, MA, April 2023. USENIX Association."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_22_1","unstructured":"NVDIA. Massively scale your deep learning training with nccl 2.4. https:\/\/developer.nvidia.com\/blog\/massively-scale-deep-learning-training-nccl-2-4\/."},{"key":"e_1_3_2_1_23_1","volume-title":"Nvidia dgx a100 system architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/dgx-a100\/dgxa100-system-architecture-white-paper.pdf","author":"NVIDIA.","year":"2020","unstructured":"NVIDIA. Nvidia dgx a100 system architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/dgx-a100\/dgxa100-system-architecture-white-paper.pdf, 2020."},{"key":"e_1_3_2_1_24_1","volume-title":"NVIDIA Collective Communication Library (NCCL) Documentation. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/index.html","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. NVIDIA Collective Communication Library (NCCL) Documentation. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/index.html, 2022."},{"key":"e_1_3_2_1_25_1","volume-title":"https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. NVLINK. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/, 2022."},{"key":"e_1_3_2_1_26_1","volume-title":"NVSWITCH: The world's highest-bandwidth on-node switch. https:\/\/images.nvidia.com\/content\/pdf\/nvswitch-technical-overview.pdf","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. NVSWITCH: The world's highest-bandwidth on-node switch. https:\/\/images.nvidia.com\/content\/pdf\/nvswitch-technical-overview.pdf, 2022."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519563"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_31_1","volume-title":"Synthesizing collective communication algorithms for heterogeneous networks with taccl. arXiv preprint","author":"Shah Aashaka","year":"2021","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, Olli Saarikivi, and Rachee Singh. Synthesizing collective communication algorithms for heterogeneous networks with taccl. arXiv preprint, 2021."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488803"},{"key":"e_1_3_2_1_33_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. CoRR, abs\/1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multi-billion parameter language models using model parallelism. CoRR, abs\/1909.08053, 2019."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-39924-7_38"},{"key":"e_1_3_2_1_35_1","volume-title":"Llama: Open and efficient foundation language models","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. Llama: Open and efficient foundation language models, 2023."},{"key":"e_1_3_2_1_36_1","first-page":"12","article-title":"Efficient mpi-allreduce for large-scale deep learning on gpu-clusters","volume":"33","author":"Truong Thao Nguyen","year":"2019","unstructured":"Thao Nguyen Truong, Mohamed Wahib, and Ryousei Takano. Efficient mpi-allreduce for large-scale deep learning on gpu-clusters. Concurrency and Computation: Practice and Experience, 33, 12 2019.","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00057"},{"key":"e_1_3_2_1_38_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems, volume 30. Curran Associates, Inc., 2017."},{"key":"e_1_3_2_1_39_1","volume-title":"Blink: Fast and generic collectives for distributed ML. CoRR, abs\/1910.04940","author":"Wang Guanhua","year":"2019","unstructured":"Guanhua Wang, Shivaram Venkataraman, Amar Phanishayee, Jorgen Thelin, Nikhil R. Devanur, and Ion Stoica. Blink: Fast and generic collectives for distributed ML. CoRR, abs\/1910.04940, 2019."},{"key":"e_1_3_2_1_40_1","first-page":"93","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Wang Shibo","year":"2023","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, Sameer Kumar, Tongfei Guo, Yuanzhong Xu, and Zongwei Zhou. Overlap communication with dependent computation via decomposition in large deep learning models. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1, ASPLOS 2023, page 93--106, New York, NY, USA, 2022. Association for Computing Machinery."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS57875.2023.00054"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_43_1","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. Alpa: Automating inter- and Intra-Operator parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 559--578, Carlsbad, CA, July 2022. USENIX Association."}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620666.3651379","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:43Z","timestamp":1750291423000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620666.3651379"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":43,"alternative-id":["10.1145\/3620666.3651379","10.1145\/3620666"],"URL":"https:\/\/doi.org\/10.1145\/3620666.3651379","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}