{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:34:04Z","timestamp":1777106044968,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,11]],"date-time":"2022-06-11T00:00:00Z","timestamp":1654905600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100002418","name":"Intel Corporation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100002418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100019827","name":"Meta","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100019827","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,18]]},"DOI":"10.1145\/3470496.3527382","type":"proceedings-article","created":{"date-parts":[[2022,5,31]],"date-time":"2022-05-31T19:06:01Z","timestamp":1654023961000},"page":"581-596","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":32,"title":["Themis"],"prefix":"10.1145","author":[{"given":"Saeed","family":"Rashidi","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"William","family":"Won","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sudarshan","family":"Srinivasan","sequence":"additional","affiliation":[{"name":"Intel, Bangalore, Karnataka, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Srinivas","family":"Sridharan","sequence":"additional","affiliation":[{"name":"Meta"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tushar","family":"Krishna","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2008. Introduction to InfiniBand\u2122. https:\/\/network.nvidia.com\/related-docs\/whitepapers\/IB_Intro_WP_190.pdf.  2008. Introduction to InfiniBand\u2122. https:\/\/network.nvidia.com\/related-docs\/whitepapers\/IB_Intro_WP_190.pdf."},{"key":"e_1_3_2_1_2_1","unstructured":"2015. MPI: A Message-Passing Interface Standard. https:\/\/www.mpi-forum.org\/docs\/mpi-3.1\/mpi31-report.pdf.  2015. MPI: A Message-Passing Interface Standard. https:\/\/www.mpi-forum.org\/docs\/mpi-3.1\/mpi31-report.pdf."},{"key":"e_1_3_2_1_3_1","unstructured":"2016. Graphcore. https:\/\/www.graphcore.ai\/.  2016. Graphcore. https:\/\/www.graphcore.ai\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2016. Habana. https:\/\/habana.ai.  2016. Habana. https:\/\/habana.ai."},{"key":"e_1_3_2_1_5_1","unstructured":"2017. NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl  2017. NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl"},{"key":"e_1_3_2_1_6_1","unstructured":"2018. Cloud TPU. https:\/\/cloud.google.com\/tpu.  2018. Cloud TPU. https:\/\/cloud.google.com\/tpu."},{"key":"e_1_3_2_1_7_1","unstructured":"2019. Gaudi Training Platfrom White Paper. https:\/\/habana.ai\/wp-content\/uploads\/2019\/06\/Habana-Gaudi-Training-Platform-whitepaper.pdf.  2019. Gaudi Training Platfrom White Paper. https:\/\/habana.ai\/wp-content\/uploads\/2019\/06\/Habana-Gaudi-Training-Platform-whitepaper.pdf."},{"key":"e_1_3_2_1_8_1","unstructured":"2019. Introduction to High Bandwidth and Low Latency Network Design with 400GE. https:\/\/www.ciscolive.com\/c\/dam\/r\/ciscolive\/us\/docs\/2019\/pdf\/BRKDCN-2213.pdf.  2019. Introduction to High Bandwidth and Low Latency Network Design with 400GE. https:\/\/www.ciscolive.com\/c\/dam\/r\/ciscolive\/us\/docs\/2019\/pdf\/BRKDCN-2213.pdf."},{"key":"e_1_3_2_1_9_1","unstructured":"2019. NVIDIA DGX-2. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/  2019. NVIDIA DGX-2. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/"},{"key":"e_1_3_2_1_10_1","unstructured":"2019. The First Xe-HPC Deployment: Aurora with Xe Link. https:\/\/www.anandtech.com\/show\/15188\/analyzing-intels-discrete-xe-hpc-graphics-disclosure-ponte-vecchio\/5.  2019. The First Xe-HPC Deployment: Aurora with Xe Link. https:\/\/www.anandtech.com\/show\/15188\/analyzing-intels-discrete-xe-hpc-graphics-disclosure-ponte-vecchio\/5."},{"key":"e_1_3_2_1_11_1","unstructured":"2020. ASTRA-SIM: Enabling SW\/HW Co-Design Exploration for Distributed DL Training Platforms. https:\/\/github.com\/astra-sim\/astra-sim.git.  2020. ASTRA-SIM: Enabling SW\/HW Co-Design Exploration for Distributed DL Training Platforms. https:\/\/github.com\/astra-sim\/astra-sim.git."},{"key":"e_1_3_2_1_12_1","unstructured":"2020. Mellanox SHARP. https:\/\/docs.mellanox.com\/display\/sharpv214.  2020. Mellanox SHARP. https:\/\/docs.mellanox.com\/display\/sharpv214."},{"key":"e_1_3_2_1_13_1","unstructured":"2020. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/.  2020. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/."},{"key":"e_1_3_2_1_14_1","unstructured":"2021. AMD Infinity Architecture. https:\/\/www.amd.com\/en\/technologies\/infinity-architecture.  2021. AMD Infinity Architecture. https:\/\/www.amd.com\/en\/technologies\/infinity-architecture."},{"key":"e_1_3_2_1_15_1","unstructured":"2021. AMD Instinct\u2122 MI250X Accelerator. amd.com\/en\/products\/server-accelerators\/instinct-mi250x.  2021. AMD Instinct\u2122 MI250X Accelerator. amd.com\/en\/products\/server-accelerators\/instinct-mi250x."},{"key":"e_1_3_2_1_16_1","unstructured":"2021. ConnectX SmartNICs. https:\/\/www.nvidia.com\/en-in\/networking\/ethernet-adapters\/.  2021. ConnectX SmartNICs. https:\/\/www.nvidia.com\/en-in\/networking\/ethernet-adapters\/."},{"key":"e_1_3_2_1_17_1","unstructured":"2021. Fully Sharded Data Parallel: faster AI training with fewer GPUs. https:\/\/engineering.fb.com\/2021\/07\/15\/open-source\/fsdp.  2021. Fully Sharded Data Parallel: faster AI training with fewer GPUs. https:\/\/engineering.fb.com\/2021\/07\/15\/open-source\/fsdp."},{"key":"e_1_3_2_1_18_1","unstructured":"2021. Google Open-Sources Trillion-Parameter AI Language Model Switch Transformer. https:\/\/www.infoq.com\/news\/2021\/02\/google-trillion-parameter-ai\/.  2021. Google Open-Sources Trillion-Parameter AI Language Model Switch Transformer. https:\/\/www.infoq.com\/news\/2021\/02\/google-trillion-parameter-ai\/."},{"key":"e_1_3_2_1_19_1","unstructured":"2021. NVIDIA DGX SuperPOD: Instant Infrastructure for AI Leadership. https:\/\/resources.nvidia.com\/en-us-auto-datacenter\/nvpod-superpod-wp-09.  2021. NVIDIA DGX SuperPOD: Instant Infrastructure for AI Leadership. https:\/\/resources.nvidia.com\/en-us-auto-datacenter\/nvpod-superpod-wp-09."},{"key":"e_1_3_2_1_20_1","unstructured":"2022. Intel Ponte Vecchio. https:\/\/www.nextplatform.com\/2021\/08\/24\/intels-ponte-vecchio-gpu-better-not-be-a-bridge-too-far\/.  2022. Intel Ponte Vecchio. https:\/\/www.nextplatform.com\/2021\/08\/24\/intels-ponte-vecchio-gpu-better-not-be-a-bridge-too-far\/."},{"key":"e_1_3_2_1_21_1","unstructured":"2022. NVIDIA H100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/.  2022. NVIDIA H100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/."},{"key":"e_1_3_2_1_22_1","unstructured":"2022. NVLink AND NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/.  2022. NVLink AND NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/1465482.1465560"},{"key":"e_1_3_2_1_24_1","unstructured":"Dario Amodei and Danny Hernandez. 2018. AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/  Dario Amodei and Danny Hernandez. 2018. AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080231"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPPS.1993.262873"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/SHPCC.1992.232628"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1206"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/1122971.1122975"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"e_1_3_2_1_32_1","unstructured":"Meghan Cowan Saeed Maleki Madanlal Musuvathi Olli Saarikivi and Yifan Xiong. 2022. GC3: An Optimizing Compiler for GPU Collective Communication. arXiv:2201.11840 [cs.DC]  Meghan Cowan Saeed Maleki Madanlal Musuvathi Olli Saarikivi and Yifan Xiong. 2022. GC3: An Optimizing Compiler for GPU Collective Communication. arXiv:2201.11840 [cs.DC]"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476178"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00056"},{"key":"e_1_3_2_1_35_1","unstructured":"Ethernet Technology Consortium. 2020. 800G Specification. https:\/\/ethernettechnologyconsortium.org\/wp-content\/uploads\/2020\/03\/800G-Specification_r1.0.pdf.  Ethernet Technology Consortium. 2020. 800G Specification. https:\/\/ethernettechnologyconsortium.org\/wp-content\/uploads\/2020\/03\/800G-Specification_r1.0.pdf."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil Devanur Greg Ganger and Phil Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. arXiv:1806.03377 [cs.DC]  Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil Devanur Greg Ganger and Phil Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. arXiv:1806.03377 [cs.DC]","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_37_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv:1512.03385 [cs.CV]  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv:1512.03385 [cs.CV]"},{"key":"e_1_3_2_1_38_1","volume-title":"Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang , Youlong Cheng , Ankur Bapna , Orhan Firat , Mia Xu Chen , Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019 . GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism . arXiv:1811.06965 [cs.CV] Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. arXiv:1811.06965 [cs.CV]"},{"key":"e_1_3_2_1_39_1","unstructured":"Intel. 2020. Intel oneAPI Collective Communications Library. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/tools\/oneapi\/components\/oneccl.html  Intel. 2020. Intel oneAPI Collective Communications Library. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/tools\/oneapi\/components\/oneccl.html"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.2200\/S00772ED1V01Y201704CAC040"},{"key":"e_1_3_2_1_41_1","unstructured":"Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu Tiegang Chen Guangxiao Hu Shaohuai Shi and Xiaowen Chu. 2018. Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes. arXiv:1807.11205 [cs.LG]  Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu Tiegang Chen Guangxiao Hu Shaohuai Shi and Xiaowen Chu. 2018. Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes. arXiv:1807.11205 [cs.LG]"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIBE50027.2020.00181"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jneumeth.2022.109478"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"e_1_3_2_1_45_1","volume-title":"Jieyang Chen, Jiajia Li, Xu Liu, Nathan R. Tallent, and Kevin J. Barker.","author":"Li Ang","year":"2019","unstructured":"Ang Li , Shuaiwen Leon Song , Jieyang Chen, Jiajia Li, Xu Liu, Nathan R. Tallent, and Kevin J. Barker. 2019 . Evaluating Modern GPU Interconnect: PC Ie , NVLink, NV-SLI, NVSwitch and GPUDirect . arXiv:1903.04611 [cs.AR] Ang Li, Shuaiwen Leon Song, Jieyang Chen, Jiajia Li, Xu Liu, Nathan R. Tallent, and Kevin J. Barker. 2019. Evaluating Modern GPU Interconnect: PCIe, NVLink, NV-SLI, NVSwitch and GPUDirect. arXiv:1903.04611 [cs.AR]"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322259"},{"key":"e_1_3_2_1_47_1","unstructured":"Liang Luo Peter West Jacob Nelson Arvind Krishnamurthy and Luis Ceze. 2020. PLink: Discovering and Exploiting Locality for Accelerated Distributed Training on the public Cloud. In 2022 Machine Learning and Systems (MLSys). 82--97. https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/182be0c5cdcd5072bb1864cdee4d3d6e-Paper.pdf  Liang Luo Peter West Jacob Nelson Arvind Krishnamurthy and Luis Ceze. 2020. PLink: Discovering and Exploiting Locality for Accelerated Distributed Training on the public Cloud. In 2022 Machine Learning and Systems (MLSys). 82--97. https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/182be0c5cdcd5072bb1864cdee4d3d6e-Paper.pdf"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Dheevatsa Mudigere Yuchen Hao Jianyu Huang Zhihao Jia Andrew Tulloch Srinivas Sridharan Xing Liu Mustafa Ozdal Jade Nie Jongsoo Park Liang Luo Jie Amy Yang Leon Gao Dmytro Ivchenko Aarti Basant Yuxi Hu Jiyan Yang Ehsan K. Ardestani Xiaodong Wang Rakesh Komuravelli Ching-Hsiang Chu Serhat Yilmaz Huayu Li Jiyuan Qian Zhuobo Feng Yinbin Ma Junjie Yang Ellie Wen Hong Li Lin Yang Chonglin Sun Whitney Zhao Dimitry Melts Krishna Dhulipala KR Kishore Tyler Graf Assaf Eisenman Kiran Kumar Matam Adi Gangidi Guoqiang Jerry Chen Manoj Krishnan Avinash Nayak Krishnakumar Nair Bharath Muthiah Mahmoud khorashadi Pallab Bhattacharya Petr Lapukhov Maxim Naumov Ajit Mathews Lin Qiao Mikhail Smelyanskiy Bill Jia and Vijay Rao. 2021. Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models. arXiv:2104.05158 [cs.DC]  Dheevatsa Mudigere Yuchen Hao Jianyu Huang Zhihao Jia Andrew Tulloch Srinivas Sridharan Xing Liu Mustafa Ozdal Jade Nie Jongsoo Park Liang Luo Jie Amy Yang Leon Gao Dmytro Ivchenko Aarti Basant Yuxi Hu Jiyan Yang Ehsan K. Ardestani Xiaodong Wang Rakesh Komuravelli Ching-Hsiang Chu Serhat Yilmaz Huayu Li Jiyuan Qian Zhuobo Feng Yinbin Ma Junjie Yang Ellie Wen Hong Li Lin Yang Chonglin Sun Whitney Zhao Dimitry Melts Krishna Dhulipala KR Kishore Tyler Graf Assaf Eisenman Kiran Kumar Matam Adi Gangidi Guoqiang Jerry Chen Manoj Krishnan Avinash Nayak Krishnakumar Nair Bharath Muthiah Mahmoud khorashadi Pallab Bhattacharya Petr Lapukhov Maxim Naumov Ajit Mathews Lin Qiao Mikhail Smelyanskiy Bill Jia and Vijay Rao. 2021. Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models. arXiv:2104.05158 [cs.DC]","DOI":"10.1145\/3470496.3533727"},{"key":"e_1_3_2_1_49_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arXiv:1906.00091 [cs.IR]  Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arXiv:1906.00091 [cs.IR]"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2005.335"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arXiv:1910.02054 [cs.LG]  Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arXiv:1910.02054 [cs.LG]","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI51249.2020.00020"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00018"},{"key":"e_1_3_2_1_56_1","unstructured":"Aashaka Shah Vijay Chidambaram Meghan Cowan Saeed Maleki Madan Musuvathi Todd Mytkowicz Jacob Nelson Olli Saarikivi and Rachee Singh. 2021. Synthesizing Collective Communication Algorithms for Heterogeneous Networks with TACCL. arXiv:2111.04867 [cs.DC]  Aashaka Shah Vijay Chidambaram Meghan Cowan Saeed Maleki Madan Musuvathi Todd Mytkowicz Jacob Nelson Olli Saarikivi and Rachee Singh. 2021. Synthesizing Collective Communication Algorithms for Heterogeneous Networks with TACCL. arXiv:2111.04867 [cs.DC]"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.1999.10010"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00048"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2003.1213188"},{"key":"e_1_3_2_1_62_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. arXiv:1706.03762 [cs.CL]  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. arXiv:1706.03762 [cs.CL]"},{"key":"e_1_3_2_1_63_1","volume-title":"Blink: Fast and Generic Collectives for Distributed ML. In 2020 Machine Learning and Systems (MLSys). 172--186. https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/43ec517d68b6edd3015b3edc9a11367b-Paper.pdf","author":"Wang Guanhua","year":"2020","unstructured":"Guanhua Wang , Shivaram Venkataraman , Amar Phanishayee , Nikhil Devanur , Jorgen Thelin , and Ion Stoica . 2020 . Blink: Fast and Generic Collectives for Distributed ML. In 2020 Machine Learning and Systems (MLSys). 172--186. https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/43ec517d68b6edd3015b3edc9a11367b-Paper.pdf Guanhua Wang, Shivaram Venkataraman, Amar Phanishayee, Nikhil Devanur, Jorgen Thelin, and Ion Stoica. 2020. Blink: Fast and Generic Collectives for Distributed ML. In 2020 Machine Learning and Systems (MLSys). 172--186. https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/43ec517d68b6edd3015b3edc9a11367b-Paper.pdf"},{"key":"e_1_3_2_1_64_1","unstructured":"Yonghui Wu Mike Schuster Zhifeng Chen Quoc V. Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey Jeff Klingner Apurva Shah Melvin Johnson Xiaobing Liu Lukasz Kaiser Stephan Gouws Yoshikiyo Kato Taku Kudo Hideto Kazawa Keith Stevens George Kurian Nishant Patil Wei Wang Cliff Young Jason Smith Jason Riesa Alex Rudnick Oriol Vinyals Greg Corrado Macduff Hughes and Jeffrey Dean. 2016. Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. arXiv:1609.08144 [cs.CL]  Yonghui Wu Mike Schuster Zhifeng Chen Quoc V. Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey Jeff Klingner Apurva Shah Melvin Johnson Xiaobing Liu Lukasz Kaiser Stephan Gouws Yoshikiyo Kato Taku Kudo Hideto Kazawa Keith Stevens George Kurian Nishant Patil Wei Wang Cliff Young Jason Smith Jason Riesa Alex Rudnick Oriol Vinyals Greg Corrado Macduff Hughes and Jeffrey Dean. 2016. Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. arXiv:1609.08144 [cs.CL]"},{"key":"e_1_3_2_1_65_1","unstructured":"Yasaman Ghadar and Tim Williams. 2020. An Overview of Aurora Argonne's Upcoming Exascale System. https:\/\/ecpannualmeeting.com\/assets\/overview\/sessions\/Aurora-Public-FULL-talk-Feb-4-2020_for_posting_c.pdf  Yasaman Ghadar and Tim Williams. 2020. An Overview of Aurora Argonne's Upcoming Exascale System. https:\/\/ecpannualmeeting.com\/assets\/overview\/sessions\/Aurora-Public-FULL-talk-Feb-4-2020_for_posting_c.pdf"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-03770-2_41"}],"event":{"name":"ISCA '22: The 49th Annual International Symposium on Computer Architecture","location":"New York New York","acronym":"ISCA '22","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE CS TCAA IEEE CS technical committee on architectural acoustics"]},"container-title":["Proceedings of the 49th Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3470496.3527382","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3470496.3527382","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:27Z","timestamp":1750188627000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3470496.3527382"}},"subtitle":["a network bandwidth-aware collective scheduling policy for distributed training of DL models"],"short-title":[],"issued":{"date-parts":[[2022,6,11]]},"references-count":66,"alternative-id":["10.1145\/3470496.3527382","10.1145\/3470496"],"URL":"https:\/\/doi.org\/10.1145\/3470496.3527382","relation":{},"subject":[],"published":{"date-parts":[[2022,6,11]]},"assertion":[{"value":"2022-06-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}