{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T22:21:14Z","timestamp":1778278874779,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3689031.3717466","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T06:25:20Z","timestamp":1742970320000},"page":"541-557","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Comprehensive Deadlock Prevention for GPU Collective Communication"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7451-0140","authenticated-orcid":false,"given":"Lichen","family":"Pan","sequence":"first","affiliation":[{"name":"School of Computer Science, Peking University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8213-7718","authenticated-orcid":false,"given":"Juncheng","family":"Liu","sequence":"additional","affiliation":[{"name":"OneFlow Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7564-5239","authenticated-orcid":false,"given":"Yongquan","family":"Fu","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0700-2645","authenticated-orcid":false,"given":"Jinhui","family":"Yuan","sequence":"additional","affiliation":[{"name":"OneFlow Research"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1652-7202","authenticated-orcid":false,"given":"Rongkai","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7015-0491","authenticated-orcid":false,"given":"Pengze","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6784-9709","authenticated-orcid":false,"given":"Zhen","family":"Xiao","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Tensorflow: A system for large-scale machine learning. In 12th { USENIX} symposium on operating systems design and implementation ({OSDI} 16). 265--283.","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. 2016. Tensorflow: A system for large-scale machine learning. In 12th { USENIX} symposium on operating systems design and implementation ({OSDI} 16). 265--283."},{"key":"e_1_3_2_1_2_1","unstructured":"AMD. 2023. IOMMU Advisory for Multi-GPU Environments. [Online]. https:\/\/community.amd.com\/t5\/knowledge-base\/iommu-advisory-for-multi-gpu-environments\/ta- p\/477468."},{"key":"e_1_3_2_1_3_1","unstructured":"Dario Amodei Danny Hernandez Girish Sastry Jack Clark Greg Brockman and Ilya Sutskever. 2018. AI and Compute. [Online]. https:\/\/openai.com\/blog\/ai-and-compute\/."},{"key":"e_1_3_2_1_4_1","unstructured":"S Tanenbaum Andrew and Bos Herbert. 2015. Modern operating systems. Pearson Education."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"e_1_3_2_1_6_1","volume-title":"Pathways: Asynchronous distributed dataflow for ML. arXiv preprint arXiv:2203.12533","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, Jeff Dean, Sanjay Ghemawat, Steven Hand, Dan Hurt, Michael Isard, Hyeontaek Lim, Ruoming Pang, Sudip Roy, et al. 2022. Pathways: Asynchronous distributed dataflow for ML. arXiv preprint arXiv:2203.12533 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Maximizing Parallelism in Distributed Training for Huge Neural Networks. CoRR abs\/2105.14450","author":"Bian Zhengda","year":"2021","unstructured":"Zhengda Bian, Qifan Xu, Boxiang Wang, and Yang You. 2021. Maximizing Parallelism in Distributed Training for Huge Neural Networks. CoRR abs\/2105.14450 (2021). arXiv:2105.14450 https:\/\/arxiv.org\/abs\/2105.14450"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018748"},{"key":"e_1_3_2_1_10_1","first-page":"241","article-title":"Blueconnect: Decomposing all-reduce for deep learning on heterogeneous network hierarchy","volume":"1","author":"Cho Minsik","year":"2019","unstructured":"Minsik Cho, Ulrich Finkler, David Kung, and Hillery Hunter. 2019. Blueconnect: Decomposing all-reduce for deep learning on heterogeneous network hierarchy. Proceedings of Machine Learning and Systems 1 (2019), 241--251.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research 24, 240 (2023), 1--113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_12_1","unstructured":"Pytorch Community. 2019. Multi-gpu example freeze and is not killable. [Online]. https:\/\/github.com\/pytorch\/pytorch\/issues\/24081."},{"key":"e_1_3_2_1_13_1","unstructured":"Pytorch Community. 2019. pin_memory stuck in DDP\/Reducer constructor. [Online]. https:\/\/github.com\/pytorch\/pytorch\/issues\/31095."},{"key":"e_1_3_2_1_14_1","unstructured":"Pytorch Community. 2021. Deadlock in a single machine multi-gpu using dataparlel when cpu is AMD. [Online]. https:\/\/github.com\/pytorch\/pytorch\/issues\/52142."},{"key":"e_1_3_2_1_15_1","unstructured":"PyTorch Community. 2023. Waring in Process Group. [Online]. https:\/\/pytorch.org\/docs\/stable\/distributed.html."},{"key":"e_1_3_2_1_16_1","unstructured":"TensorFlow Community. 2020. tf.distribute.MirroredStrategy is stuck. [Online]. https:\/\/github.com\/tensorflow\/tensorflow\/issues\/44976."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575724"},{"key":"e_1_3_2_1_18_1","volume-title":"Multi-objective evolutionary optimisation for product design and manufacturing","author":"Deb Kalyanmoy","unstructured":"Kalyanmoy Deb. 2011. Multi-objective optimisation using evolutionary algorithms: an introduction. In Multi-objective evolutionary optimisation for product design and manufacturing. Springer, 3--34."},{"key":"e_1_3_2_1_19_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/0743-7315(92)90014-E"},{"key":"e_1_3_2_1_21_1","volume-title":"AI and Memory Wall. RiseLab Medium Post","author":"Gholami Amir","year":"2021","unstructured":"Amir Gholami, Zhewei Yao, Sehoon Kim, Michael W. Mahoney, and Kurt Keutzer. 2021. AI and Memory Wall. RiseLab Medium Post (2021). Retrieved Feb. 2, 2023 from https:\/\/medium.com\/riselab\/ai-and-memory-wall-2cb4265cb0b8"},{"key":"e_1_3_2_1_22_1","volume-title":"Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022, Marcos K. Aguilera and Hakim Weatherspoon (Eds.). USENIX Association, 539--558. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/han"},{"key":"e_1_3_2_1_23_1","first-page":"418","article-title":"Tictac: Accelerating distributed deep learning with communication scheduling","volume":"1","author":"Hashemi Sayed Hadi","year":"2019","unstructured":"Sayed Hadi Hashemi, Sangeetha Abdu Jyothi, and Roy Campbell. 2019. Tictac: Accelerating distributed deep learning with communication scheduling. Proceedings of Machine Learning and Systems 1 (2019), 418--430.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_25_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_26_1","unstructured":"Hugging Face. 2021. CodeParrot. [Online]. https:\/\/huggingface.co\/codeparrot\/codeparrot."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_28_1","first-page":"132","article-title":"Priority-based parameter propagation for distributed DNN training","volume":"1","author":"Jayarajan Anand","year":"2019","unstructured":"Anand Jayarajan, Jinliang Wei, Garth Gibson, Alexandra Fedorova, and Gennady Pekhimenko. 2019. Priority-based parameter propagation for distributed DNN training. Proceedings of Machine Learning and Systems 1 (2019), 132--145.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_30_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. 2024. {MegaScale}: Scaling Large Language Model Training to More Than 10,000 {GPUs}. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 745--760."},{"key":"e_1_3_2_1_31_1","unstructured":"Jiri Kraus. 2013. An Introduction to CUDA-Aware MPI. [Online]. https:\/\/developer.nvidia.com\/blog\/introduction-cuda-aware-mpi\/."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.2988251"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS.2018.00029"},{"key":"e_1_3_2_1_34_1","volume-title":"Scaling Distributed Machine Learning with the Parameter Server. In 11th USENIX Symposium on Operating Systems Design and Implementation, OSDI '14","author":"Li Mu","year":"2014","unstructured":"Mu Li, David G. Andersen, Jun Woo Park, Alexander J. Smola, Amr Ahmed, Vanja Josifovski, James Long, Eugene J. Shekita, and Bor-Yiing Su. 2014. Scaling Distributed Machine Learning with the Parameter Server. In 11th USENIX Symposium on Operating Systems Design and Implementation, OSDI '14, Broomfield, CO, USA, October 6-8, 2014, Jason Flinn and Hank Levy (Eds.). USENIX Association, 583--598. https:\/\/www.usenix.org\/conference\/osdi14\/technical-sessions\/presentation\/li_mu"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.76"},{"key":"e_1_3_2_1_37_1","first-page":"82","article-title":"Plink: Discovering and exploiting locality for accelerated distributed training on the public cloud","volume":"2","author":"Luo Liang","year":"2020","unstructured":"Liang Luo, Peter West, Jacob Nelson, Arvind Krishnamurthy, and Luis Ceze. 2020. Plink: Discovering and exploiting locality for accelerated distributed training on the public cloud. Proceedings of Machine Learning and Systems 2 (2020), 82--97.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_38_1","volume-title":"KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Mai Luo","year":"2020","unstructured":"Luo Mai, Guo Li, Marcel Wagenl\u00e4nder, Konstantinos Fertakis, Andrei-Octavian Brabete, and Peter Pietzuch. 2020. KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 937--954."},{"key":"e_1_3_2_1_39_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 203--216."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2021. NCCL Tests. [Online]. https:\/\/github.com\/NVIDIA\/nccl-tests."},{"key":"e_1_3_2_1_43_1","unstructured":"NVIDIA. 2023. CUDA C++ Programming Guide. [Online]. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html."},{"key":"e_1_3_2_1_44_1","unstructured":"NVIDIA. 2023. CUDA Stream Management. [Online]. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/group__CUDART__STREAM.html."},{"key":"e_1_3_2_1_45_1","unstructured":"NVIDIA. 2023. Inter-GPU Communication with CUDA-aware MPI. [Online]. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/mpi.html."},{"key":"e_1_3_2_1_46_1","unstructured":"NVIDIA. 2023. NVIDIA NCCL. [Online]. https:\/\/github.com\/nvidia\/nccl."},{"key":"e_1_3_2_1_47_1","unstructured":"NVIDIA. 2023. PCI Access Control Services (ACS). [Online]. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/troubleshooting.html."},{"key":"e_1_3_2_1_48_1","unstructured":"NVIDIA. 2023. Using Multiple NCCL Communicators Concurrently. [Online]. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/usage\/communicators.html."},{"key":"e_1_3_2_1_49_1","first-page":"22","article-title":"Scheduling Techniques for Concurrent Systems","volume":"82","author":"Ousterhout John K","year":"1982","unstructured":"John K Ousterhout et al. 1982. Scheduling Techniques for Concurrent Systems.. In ICDCS, Vol. 82. 22--30.","journal-title":"ICDCS"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694346"},{"key":"e_1_3_2_1_51_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019), 8026--8037."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_53_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527382"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_56_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_57_1","unstructured":"Jaime Sevilla and Pablo Villalobos. 2021. Parameter Counts in Machine Learning. [Online]. https:\/\/www.alignmentforum.org\/posts\/GzoWcYibWYwJva8aL\/parametercounts-in-machine-learning."},{"key":"e_1_3_2_1_58_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Shah Aashaka","year":"2023","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, and Olli Saarikivi. 2023. TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 593--612."},{"key":"e_1_3_2_1_59_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2014.6853208"},{"key":"e_1_3_2_1_61_1","volume-title":"MVAPICH: MPI over InfiniBand, Omni-Path, Ethernet\/iWARP, RoCE, and Slingshot. [Online]. https:\/\/mvapich.cse.ohio-state.edu\/.","author":"Team MVAPICH","year":"2023","unstructured":"MVAPICH Team. 2023. MVAPICH: MPI over InfiniBand, Omni-Path, Ethernet\/iWARP, RoCE, and Slingshot. [Online]. https:\/\/mvapich.cse.ohio-state.edu\/."},{"key":"e_1_3_2_1_62_1","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ml","volume":"2","author":"Wang Guanhua","year":"2020","unstructured":"Guanhua Wang, Shivaram Venkataraman, Amar Phanishayee, Nikhil Devanur, Jorgen Thelin, and Ion Stoica. 2020. Blink: Fast and generic collectives for distributed ml. Proceedings of Machine Learning and Systems 2 (2020), 172--186.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037742"},{"key":"e_1_3_2_1_64_1","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. [Online]. https:\/\/github.com\/facebookresearch\/detectron2."},{"key":"e_1_3_2_1_65_1","volume-title":"An Efficient 2D Method for Training Super-Large Deep Learning Models. CoRR abs\/2104.05343","author":"Xu Qifan","year":"2021","unstructured":"Qifan Xu, Shenggui Li, Chaoyu Gong, and Yang You. 2021. An Efficient 2D Method for Training Super-Large Deep Learning Models. CoRR abs\/2104.05343 (2021). arXiv:2104.05343 https:\/\/arxiv.org\/abs\/2104.05343"},{"key":"e_1_3_2_1_66_1","unstructured":"Jinhui Yuan Xinqi Li Cheng Cheng Juncheng Liu Ran Guo Shenghang Cai Chi Yao Fei Yang Xiaodong Yi Chuan Wu et al. 2021. OneFlow: Redesign the Distributed Deep Learning Framework from Scratch. arXiv preprint arXiv:2110.15032 (2021)."},{"key":"e_1_3_2_1_67_1","volume-title":"2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Zhang Hao","year":"2017","unstructured":"Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric P Xing. 2017. Poseidon: An efficient communication architecture for distributed deep learning on {GPU} clusters. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). 181--193."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS.2015.7108420"}],"event":{"name":"EuroSys '25: Twentieth European Conference on Computer Systems","location":"Rotterdam Netherlands","acronym":"EuroSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Twentieth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3717466","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689031.3717466","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:19:50Z","timestamp":1755775190000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3717466"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":68,"alternative-id":["10.1145\/3689031.3717466","10.1145\/3689031"],"URL":"https:\/\/doi.org\/10.1145\/3689031.3717466","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}