{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T16:03:29Z","timestamp":1772726609634,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/100007015","name":"University of Wisconsin-Madison","doi-asserted-by":"publisher","award":["AAK4964-2022-23"],"award-info":[{"award-number":["AAK4964-2022-23"]}],"id":[{"id":"10.13039\/100007015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007015","name":"University of Wisconsin-Madison","doi-asserted-by":"publisher","award":["133-AAK8232"],"award-info":[{"award-number":["133-AAK8232"]}],"id":[{"id":"10.13039\/100007015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["ENS-1925485"],"award-info":[{"award-number":["ENS-1925485"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640410","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"1146-1164","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["T3: Transparent Tracking &amp; Triggering for Fine-grained Overlap of Compute &amp; Collectives"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1083-1146","authenticated-orcid":false,"given":"Suchita","family":"Pati","sequence":"first","affiliation":[{"name":"Computer Sciences Department, University of Wisconsin-Madison, Madison, Wisconsin, USA"},{"name":"Research and Advanced Development (RAD), Advanced Micro Devices, Inc, Santa Clara, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9552-0508","authenticated-orcid":false,"given":"Shaizeen","family":"Aga","sequence":"additional","affiliation":[{"name":"Research and Advanced Development (RAD), Advanced Micro Devices, Inc, Santa Clara, California, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9980-9720","authenticated-orcid":false,"given":"Mahzabeen","family":"Islam","sequence":"additional","affiliation":[{"name":"Research and Advanced Development (RAD), Advanced Micro Devices, Inc, Austin, Texas, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2973-9479","authenticated-orcid":false,"given":"Nuwan","family":"Jayasena","sequence":"additional","affiliation":[{"name":"Research and Advanced Development (RAD), Advanced Micro Devices, Inc, Santa Clara, California, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0189-7895","authenticated-orcid":false,"given":"Matthew D.","family":"Sinclair","sequence":"additional","affiliation":[{"name":"Computer Sciences Department, University of Wisconsin-Madison, Madison, Wisconsin, United States"},{"name":"Research and Advanced Development (RAD), Advanced Micro Devices, Inc, Austin, Texas, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AMD. 2018. AMD's ROCm Communication Collectives Library. 
\"https:\/\/github.com\/ROCmSoftwarePlatform\/rccl\/wiki\"."},{"key":"e_1_3_2_1_2_1","unstructured":"AMD. 2019. AMD's BLAS Library. \"https:\/\/github.com\/ROCmSoftwarePlatform\/rocBLAS\"."},{"key":"e_1_3_2_1_3_1","unstructured":"AMD. 2020. AMD's tool for creating a benchmark-driven backend library for GEMMs. \"https:\/\/github.com\/ROCmSoftwarePlatform\/Tensile\/\"."},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2021. AMD HSA Code Object Format. \"https:\/\/rocmdocs.amd.com\/en\/latest\/ROCm_Compiler_SDK\/ROCm-Codeobj-format.html\"."},{"key":"e_1_3_2_1_5_1","unstructured":"AMD. 2022. AMD INSTINCT\u2122 MI210 ACCELERATOR. https:\/\/www.amd.com\/en\/products\/server-accelerators\/amd-instinct-mi210."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569666"},{"key":"e_1_3_2_1_8_1","volume-title":"AI Report","author":"Benaich Nathan","year":"2022","unstructured":"Nathan Benaich and Ian Hogarth. 2022. State of AI Report 2022. https:\/\/www.stateof.ai\/."},{"key":"e_1_3_2_1_9_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS","volume":"1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (NeurIPS, Vol. 33), H. Larochelle, M. Ranzato, R. Hadsell, M. F. Balcan, and H. Lin (Eds.). Curran Associates Inc., Red Hook, NY, USA, 1877--1901."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.58"},{"key":"e_1_3_2_1_12_1","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung Won Chung Charles Sutton Sebastian Gehrmann Parker Schuh Kensen Shi Sasha Tsvyashchenko Joshua Maynez Abhishek Rao Parker Barnes Yi Tay Noam Shazeer Vinodkumar Prabhakaran Emily Reif Nan Du Ben Hutchinson Reiner Pope James Bradbury Jacob Austin Michael Isard Guy Gur-Ari Pengcheng Yin Toju Duke Anselm Levskaya Sanjay Ghemawat Sunipa Dev Henryk Michalewski Xavier Garcia Vedant Misra Kevin Robinson Liam Fedus Denny Zhou Daphne Ippolito David Luan Hyeontaek Lim Barret Zoph Alexander Spiridonov Ryan Sepassi David Dohan Shivani Agrawal Mark Omernick Andrew M. Dai Thanumalayan Sankaranarayana Pillai Marie Pellat Aitor Lewkowycz Erica Moreira Rewon Child Oleksandr Polozov Katherine Lee Zongwei Zhou Xuezhi Wang Brennan Saeta Mark Diaz Orhan Firat Michele Catasta Jason Wei Kathy Meier-Hellstern Douglas Eck Jeff Dean Slav Petrov and Noah Fiedel. 2022. PaLM: Scaling Language Modeling with Pathways. arXiv preprint arXiv:2204.02311 (2022) 87 pages."},{"key":"e_1_3_2_1_13_1","first-page":"16344","article-title":"FlashAttention: Fast and Memory-efficient Exact Attention with IO-Awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. 
FlashAttention: Fast and Memory-efficient Exact Attention with IO-Awareness. Advances in Neural Information Processing Systems 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1423"},{"key":"e_1_3_2_1_15_1","unstructured":"Ashraf Eassa and Sukru Burc Eryilmaz. 2022. The Full Stack Optimization Powering NVIDIA MLPerf Training v2.0 Performance. https:\/\/developer.nvidia.com\/blog\/boosting-mlperf-training-performance-with-full-stack-optimization\/."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783716"},{"key":"e_1_3_2_1_17_1","article-title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. The Journal of Machine Learning Research 23, 1, Article 120 (jan 2022), 39 pages.","journal-title":"The Journal of Machine Learning Research"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2082156.2082183"},{"key":"e_1_3_2_1_19_1","unstructured":"Amir Gholami. 2021. AI and Memory Wall."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00058"},{"key":"e_1_3_2_1_21_1","volume-title":"Achieving Human Parity on Automatic Chinese to English News Translation. arXiv preprint arXiv:1803.05567 (March","author":"Awadalla Hany Hassan","year":"2018","unstructured":"Hany Hassan Awadalla, Anthony Aue, Chang Chen, Vishal Chowdhary, Jonathan Clark, Christian Federmann, Xuedong Huang, Marcin Junczys-Dowmunt, Will Lewis, Mu Li, Shujie Liu, Tie-Yan Liu, Renqian Luo, Arul Menezes, Tao Qin, Frank Seide, Xu Tan, Fei Tian, Lijun Wu, Shuangzhi Wu, Yingce Xia, Dongdong Zhang, Zhirui Zhang, and Ming Zhou. 2018. Achieving Human Parity on Automatic Chinese to English News Translation. arXiv preprint arXiv:1803.05567 (March 2018), 25 pages. arXiv:1803.05567 [cs.CL]"},{"key":"e_1_3_2_1_22_1","volume-title":"Deep Residual Learning for Image Recognition. CoRR abs\/1512.03385","author":"He Kaiming","year":"2015","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep Residual Learning for Image Recognition. CoRR abs\/1512.03385 (2015), 12 pages. arXiv:1512.03385 http:\/\/arxiv.org\/abs\/1512.03385"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS","volume":"10","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS, Vol. 32). Curran Associates Inc., Red Hook, NY, USA, Article 10, 10 pages."},{"key":"e_1_3_2_1_24_1","volume-title":"ARK: GPU-driven Code Execution for Distributed Deep Learning. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). USENIX Association","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, KyoungSoo Park, Ran Shu, Xinyuan Qu, Peng Cheng, and Yongqiang Xiong. 2023. ARK: GPU-driven Code Execution for Distributed Deep Learning. 
In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). USENIX Association, Boston, MA, 87--101. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/hwang"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_26_1","unstructured":"Sylvain Jeaugey. 2022. How is tree reduction implemented? https:\/\/github.com\/NVIDIA\/nccl\/issues\/545#issuecomment-1006361565."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2588768.2576780"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2896377.2901468"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_30_1","unstructured":"Andrew Kerr Duane Merrill Julien Demouth and John Tran. 2017. cuTLASS: Fast linear algebra in CUDA C++."},{"key":"e_1_3_2_1_31_1","volume-title":"Rogers","author":"Khairy Mahmoud","year":"2018","unstructured":"Mahmoud Khairy, Akshay Jain, Tor M. Aamodt, and Timothy G. Rogers. 2018. Exploring Modern GPU Memory System Design Challenges through Accurate Modeling. CoRR abs\/1810.07269 (2018), 10 pages. arXiv:1810.07269 http:\/\/arxiv.org\/abs\/1810.07269"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00086"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_2_1_34_1","volume-title":"Grad-PIM: A Practical Processing-in-DRAM Architecture for Gradient Descent. In 27th IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE Computer Society","author":"Kim Heesu","year":"2021","unstructured":"Heesu Kim, Hanmin Park, Taehyun Kim, Kwanheum cho, Eojin Lee, Soojung Ryu, Hyuk-Jae Lee, Kiyoung Choi, and Jinho Lee. 2021. Grad-PIM: A Practical Processing-in-DRAM Architecture for Gradient Descent. In 27th IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE Computer Society, Washington, DC, USA, 14 pages."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2109.10465"},{"key":"e_1_3_2_1_36_1","volume-title":"An In-Network Architecture for Accelerating Shared-Memory Multiprocessor Collectives. In ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). IEEE, IEEE Computer Society","author":"Klenk Benjamin","year":"2020","unstructured":"Benjamin Klenk, Nan Jiang, Greg Thorson, and Larry Dennison. 2020. An In-Network Architecture for Accelerating Shared-Memory Multiprocessor Collectives. In ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). IEEE, IEEE Computer Society, Washington, DC, USA, 996--1009."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 25th International Conference on Neural Information Processing Systems -","volume":"1","author":"Krizhevsky Alex","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Proceedings of the 25th International Conference on Neural Information Processing Systems - Volume 1 (Lake Tahoe, Nevada) (NIPS'12). Curran Associates Inc., USA, 1097--1105. http:\/\/dl.acm.org\/citation.cfm?id=2999134.2999257"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00028"},{"key":"e_1_3_2_1_40_1","volume-title":"Network In Network. 
In 2nd International Conference on Learning Representations (ICLR), Yoshua Bengio and Yann LeCun (Eds.). OpenReview.net, 10 pages. http:\/\/arxiv.org\/abs\/1312","author":"Lin Min","year":"2014","unstructured":"Min Lin, Qiang Chen, and Shuicheng Yan. 2014. Network In Network. In 2nd International Conference on Learning Representations (ICLR), Yoshua Bengio and Yann LeCun (Eds.). OpenReview.net, 10 pages. http:\/\/arxiv.org\/abs\/1312.4400"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173191"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522332"},{"key":"e_1_3_2_1_43_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). USENIX Association","author":"Mahajan Kshiteej","year":"2023","unstructured":"Kshiteej Mahajan, Ching-Hsiang Chu, Srinivas Sridharan, and Aditya Akella. 2023. Better Together: Jointly Optimizing ML Collective Scheduling and Execution Planning using SYNDICATE. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). USENIX Association, Boston, MA, 809--824. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/mahajan"},{"key":"e_1_3_2_1_44_1","volume-title":"Taylor Robie, Tom St. John, Carole-Jean Wu, Lingjie Xu, Cliff Young, and Matei Zaharia.","author":"Mattson Peter","year":"2019","unstructured":"Peter Mattson, Christine Cheng, Cody Coleman, Greg Diamos, Paulius Micikevicius, David A. Patterson, Hanlin Tang, Gu-Yeon Wei, Peter Bailis, Victor Bittorf, David Brooks, Dehao Chen, Debojyoti Dutta, Udit Gupta, Kim M. Hazelwood, Andrew Hock, Xinyuan Huang, Bill Jia, Daniel Kang, David Kanter, Naveen Kumar, Jeffery Liao, Guokai Ma, Deepak Narayanan, Tayo Oguntebi, Gennady Pekhimenko, Lillian Pentecost, Vijay Janapa Reddi, Taylor Robie, Tom St. John, Carole-Jean Wu, Lingjie Xu, Cliff Young, and Matei Zaharia. 2019. MLPerf Training Benchmark. CoRR abs\/1910.01500 (2019), 14 pages. arXiv:1910.01500 http:\/\/arxiv.org\/abs\/1910.01500"},{"key":"e_1_3_2_1_45_1","unstructured":"Paulius Micikevicius Sharan Narang Jonah Alben Gregory Diamos Erich Elsen David Garcia Boris Ginsburg Michael Houston Oleksii Kuchaiev Ganesh Venkatesh and Hao Wu. 2018. Mixed Precision Training. arXiv:1710.03740 [cs.AI] http:\/\/arxiv.org\/abs\/1710.03740"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2209.05433"},{"key":"e_1_3_2_1_47_1","volume-title":"Turing-NLG: A 17-billion-parameter language model by Microsoft. Microsoft Research Blog 1, 8","year":"2020","unstructured":"Microsoft. 2020. Turing-NLG: A 17-billion-parameter language model by Microsoft. Microsoft Research Blog 1, 8 (2020), 8 pages. https:\/\/www.microsoft.com\/en-us\/research\/blog\/turing-nlg-a-17-billion-parameter-language-model-by-microsoft\/"},{"key":"e_1_3_2_1_48_1","unstructured":"MLPerf. 2018. MLPerf Benchmark Suite. https:\/\/mlperf.org\/."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00037"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480088"},{"key":"e_1_3_2_1_51_1","volume-title":"ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, IEEE Computer Society","author":"Muthukrishnan Harini","year":"2021","unstructured":"Harini Muthukrishnan, David Nellans, Daniel Lustig, Jeffrey A Fessler, and Thomas F Wenisch. 2021. Efficient Multi-GPU Shared Memory via Automatic Optimization of Fine-grained Transfers. 
In ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, IEEE Computer Society, Washington, DC, USA, 139--152."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.54"},{"key":"e_1_3_2_1_53_1","unstructured":"NVIDIA. 2017. NVIDIA DGX-1 With Tesla V100 System Architecture. https:\/\/images.nvidia.com\/content\/pdf\/dgx1-v100-system-architecture-whitepaper.pdf."},{"key":"e_1_3_2_1_54_1","unstructured":"NVIDIA. 2018. NVIDIA TESLA V100 GPU ACCELERATOR. https:\/\/images.nvidia.com\/content\/technologies\/volta\/pdf\/tesla-volta-v100-datasheet-letter-fnl-web.pdf."},{"key":"e_1_3_2_1_55_1","unstructured":"NVIDIA. 2020. NVIDIA NCCL."},{"key":"e_1_3_2_1_56_1","unstructured":"NVIDIA. 2021. NVIDIA A100 TENSOR CORE GPU. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf."},{"key":"e_1_3_2_1_57_1","unstructured":"NVIDIA. 2022. GPUDirect. \"https:\/\/developer.nvidia.com\/gpudirect\"."},{"key":"e_1_3_2_1_58_1","unstructured":"NVIDIA. 2023. Efficient GEMM in CUDA. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/media\/docs\/efficient_gemm.md#parallelized-reductions."},{"key":"e_1_3_2_1_59_1","unstructured":"NVIDIA. 2023. NVIDIA Announces DGX GH200 AI Supercomputer. https:\/\/nvidianews.nvidia.com\/news\/nvidia-grace-hopper-superchips-designed-for-accelerated-generative-ai-enter-full-production."},{"key":"e_1_3_2_1_60_1","unstructured":"NVIDIA. 2023. NVIDIA H100 TENSOR CORE GPU. https:\/\/resources.nvidia.com\/en-us-tensor-core\/nvidia-tensor-core-gpu-datasheet."},{"key":"e_1_3_2_1_61_1","volume-title":"NVIDIA cuBLAS. https:\/\/developer.nvidia.com\/cublas. Accessed","author":"NVIDIA Corp. 2016.","year":"2016","unstructured":"NVIDIA Corp. 2016. NVIDIA cuBLAS. https:\/\/developer.nvidia.com\/cublas. Accessed August 6, 2016."},{"key":"e_1_3_2_1_62_1","volume-title":"Splitwise: Efficient Generative LLM Inference Using Phase Splitting. arXiv preprint arXiv:2311.18677","author":"Patel Pratyush","year":"2023","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, \u00cd\u00f1igo Goiri, Aashaka Shah, Saeed Maleki, and Ricardo Bianchini. 2023. Splitwise: Efficient Generative LLM Inference Using Phase Splitting. arXiv preprint arXiv:2311.18677 (2023), 12 pages. arXiv:2311.18677 [cs.AR]"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00033"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC59245.2023.00026"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322212"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2011.7477494"},{"key":"e_1_3_2_1_67_1","volume-title":"GPU-initiated Fine-grained Overlap of Collective Communication with Computation. arXiv preprint arXiv:2305.06942","author":"Punniyamurthy Kishore","year":"2023","unstructured":"Kishore Punniyamurthy, Bradford M Beckmann, and Khaled Hamidouche. 2023. GPU-initiated Fine-grained Overlap of Collective Communication with Computation. arXiv preprint arXiv:2305.06942 (2023), 13 pages. arXiv:2305.06942 [cs.DC]"},{"key":"e_1_3_2_1_68_1","volume-title":"Language Models are Unsupervised Multitask Learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. 
OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_69_1","volume-title":"International Conference on Machine Learning (ICML). PMLR, PMLR","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MOE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In International Conference on Machine Learning (ICML). PMLR, PMLR, 18332--18346."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00045"},{"key":"e_1_3_2_1_73_1","article-title":"A Generalist Agent","volume":"2022","author":"Reed Scott","year":"2022","unstructured":"Scott Reed, Konrad Zolna, Emilio Parisotto, Sergio Gomez Colmenarejo, Alexander Novikov, Gabriel Barth-Maron, Mai Gimenez, Yury Sulsky, Jackie Kay, Jost Tobias Springenberg, Tom Eccles, Jake Bruce, Ali Razavi, Ashley Edwards, Nicolas Heess, Yutian Chen, Raia Hadsell, Oriol Vinyals, Mahyar Bordbar, and Nando de Freitas. 2022. A Generalist Agent. Transactions on Machine Learning Research 2022 (2022), 42 pages. https:\/\/openreview.net\/forum?id=1ikK0kHjvj","journal-title":"Transactions on Machine Learning Research"},{"key":"e_1_3_2_1_74_1","volume-title":"Sinclair","author":"Roarty Kyle","year":"2020","unstructured":"Kyle Roarty and Matthew D. Sinclair. 2020. Modeling Modern GPU Applications in gem5. In 3rd gem5 Users' Workshop. 2 pages."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3496585"},{"key":"e_1_3_2_1_76_1","unstructured":"Aarush Selvan and Pankaj Kanwar. 2022. Google showcases Cloud TPU v4 Pods for large model training. https:\/\/cloud.google.com\/blog\/topics\/tpus\/google-showcases-cloud-tpu-v4-pods-for-large-model-training."},{"key":"e_1_3_2_1_77_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). USENIX Association","author":"Shah Aashaka","year":"2023","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, Olli Saarikivi, and Rachee Singh. 2023. TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). USENIX Association, Boston, MA, 593--612. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/shah"},{"key":"e_1_3_2_1_78_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. CoRR abs\/1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. CoRR abs\/1909.08053 (2019), 9 pages. arXiv:1909.08053 [cs.CL] http:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_79_1","volume-title":"3rd International Conference on Learning Representations (ICLR), Yoshua Bengio and Yann LeCun (Eds.). OpenReview.net, 14 pages","author":"Simonyan Karen","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In 3rd International Conference on Learning Representations (ICLR), Yoshua Bengio and Yann LeCun (Eds.). 
OpenReview.net, 14 pages. http:\/\/arxiv.org\/abs\/1409.1556"},{"key":"e_1_3_2_1_80_1","volume-title":"Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro.","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530b, a Large-scale Generative Language Model. arXiv preprint arXiv:2201.11990 (2022), 44 pages. arXiv:2201.11990 [cs.CL]"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3091966.3091974"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1109\/GreenCom-CPSCom.2010.102"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2756440"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640410","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3620665.3640410","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640410","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640410","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:42Z","timestamp":1750291422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640410"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":87,"alternative-id":["10.1145\/3620665.3640410","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640410","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}