{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T05:26:13Z","timestamp":1768886773988,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T00:00:00Z","timestamp":1723420800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,12]]},"DOI":"10.1145\/3673038.3673136","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T18:29:01Z","timestamp":1723141741000},"page":"84-96","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["The Case for Co-Designing Model Architectures with Hardware"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6823-9080","authenticated-orcid":false,"given":"Quentin","family":"Anthony","sequence":"first","affiliation":[{"name":"Ohio State University, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1965-9169","authenticated-orcid":false,"given":"Jacob","family":"Hatef","sequence":"additional","affiliation":[{"name":"Ohio State University, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3020-2848","authenticated-orcid":false,"given":"Deepak","family":"Narayanan","sequence":"additional","affiliation":[{"name":"NVIDIA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8228-1042","authenticated-orcid":false,"given":"Stella","family":"Biderman","sequence":"additional","affiliation":[{"name":"EleutherAI, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1212-0379","authenticated-orcid":false,"given":"Stas","family":"Bekman","sequence":"additional","affiliation":[{"name":"Contextual AI, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3843-5520","authenticated-orcid":false,"given":"Junqi","family":"Yin","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1924-2769","authenticated-orcid":false,"given":"Aamir","family":"Shafi","sequence":"additional","affiliation":[{"name":"Ohio State University, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1200-2754","authenticated-orcid":false,"given":"Hari","family":"Subramoni","sequence":"additional","affiliation":[{"name":"Ohio State University, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0356-1781","authenticated-orcid":false,"given":"Dhabaleswar","family":"Panda","sequence":"additional","affiliation":[{"name":"Ohio State University, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,8,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2021. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer."},{"key":"e_1_3_2_1_2_1","unstructured":"2022. DeepSpeed-MII. https:\/\/github.com\/microsoft\/DeepSpeed-MII."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. MLPerf. https:\/\/mlperf.org\/. Accessed: 2024\/07\/10 13:39:06."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Reza\u00a0Yazdani Aminabadi Samyam Rajbhandari Minjia Zhang Ammar\u00a0Ahmad Awan Cheng Li Du Li Elton Zheng Jeff Rasley Shaden Smith Olatunji Ruwase and Yuxiong He. 2022. 
DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. arxiv:2207.00032\u00a0[cs.LG]","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_5_1","unstructured":"Alex Andonian Quentin Anthony Stella Biderman Sid Black Preetham Gali Leo Gao Eric Hallahan Josh Levy-Kramer Connor Leahy Lucas Nestler Kip Parker Michael Pieler Jason Phang Shivanshu Purohit Hailey Schoelkopf Dashiell Stander Tri Songz Curt Tigges Benjamin Th\u00e9rien Phil Wang and Samuel Weinbach. 2023. GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch. GitHub Repo. https:\/\/www.github.com\/eleutherai\/gpt-neox"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning. PMLR, 2397\u20132430","author":"Biderman Stella","year":"2023","unstructured":"Stella Biderman, Hailey Schoelkopf, Quentin\u00a0Gregory Anthony, Herbie Bradley, Kyle O\u2019Brien, Eric Hallahan, Mohammad\u00a0Aflah Khan, Shivanshu Purohit, USVSN\u00a0Sai Prashanth, Edward Raff, 2023. Pythia: A suite for analyzing large language models across training and scaling. In International Conference on Machine Learning. PMLR, 2397\u20132430."},{"key":"e_1_3_2_1_7_1","volume-title":"GPT-Neo: Large scale autoregressive language modeling with mesh-tensorflow. If you use this software, please cite it using these metadata 58","author":"Black Sid","year":"2021","unstructured":"Sid Black, Leo Gao, Phil Wang, Connor Leahy, and Stella Biderman. 2021. GPT-Neo: Large scale autoregressive language modeling with mesh-tensorflow. If you use this software, please cite it using these metadata 58 (2021)."},{"key":"e_1_3_2_1_8_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel Ziegler Jeffrey Wu Clemens Winter Chris Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems Vol.\u00a033. 1877\u20131901."},{"key":"e_1_3_2_1_9_1","volume-title":"PaLM: Scaling Language Modeling with Pathways. Computing Research Repository","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung\u00a0Won Chung, Charles Sutton, Sebastian Gehrmann, 2022. PaLM: Scaling Language Modeling with Pathways. Computing Research Repository (2022). arXiv:2204.02311\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2204.02311v5Version 5."},{"key":"e_1_3_2_1_10_1","unstructured":"Together Computer. 2023. RedPajama: an Open Dataset for Training Large Language Models. https:\/\/github.com\/togethercomputer\/RedPajama-Data"},{"key":"e_1_3_2_1_11_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_12_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 
2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems 35 (2022), 16344\u201316359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805\u00a0[cs.CL]","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805\u00a0[cs.CL]"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS56514.2022.00007"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00050"},{"key":"e_1_3_2_1_17_1","unstructured":"Horace He. 2023. Let\u2019s talk about a detail that occurs during PyTorch 2.0\u2019s codegen - tiling.https:\/\/x.com\/cHHillee\/status\/1620878972547665921"},{"key":"e_1_3_2_1_18_1","unstructured":"Andrej Karpathy. 2023. The most dramatic optimization to nanoGPT so far (\u00a025% speedup) is to simply increase vocab size from 50257 to 50304 (nearest multiple of 64).https:\/\/x.com\/karpathy\/status\/1621578354024677377"},{"key":"e_1_3_2_1_19_1","volume-title":"XSP: Across-Stack Profiling and Analysis of Machine Learning Models on GPUs. In 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE Computer Society","author":"Li C.","year":"2020","unstructured":"C. Li, A. Dakkak, J. Xiong, W. Wei, L. Xu, and W. Hwu. 2020. XSP: Across-Stack Profiling and Analysis of Machine Learning Models on GPUs. In 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE Computer Society, Los Alamitos, CA, USA, 326\u2013327. https:\/\/doi.ieeecomputersociety.org\/10.1109\/IPDPS47924.2020.00042"},{"key":"e_1_3_2_1_20_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2019.101635"},{"key":"e_1_3_2_1_22_1","unstructured":"Zachary Nado Justin\u00a0M. Gilmer Christopher\u00a0J. Shallue Rohan Anil and George\u00a0E. Dahl. 2021. A Large Batch Optimizer Reality Check: Traditional Generic Optimizers Suffice Across Batch Sizes. arxiv:2102.06356\u00a0[cs.LG]"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA. 2023. Matrix Multiplication Background. User\u2019s Guide | NVIDIA Docs. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html"},{"key":"e_1_3_2_1_25_1","unstructured":"OLCF. 2023. OLCF6 Technical Requirements and Benchmarks."},{"key":"e_1_3_2_1_26_1","volume-title":"Test Long: Attention with Linear Biases Enables Input Length Extrapolation. In International Conference on Learning Representations.","author":"Press Ofir","year":"2021","unstructured":"Ofir Press, Noah Smith, and Mike Lewis. 2021. Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation. 
In International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_29_1","volume-title":"Modeling Deep Learning Accelerator Enabled GPUs. 2019 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)","author":"Raihan Md\u00a0Aamir","year":"2018","unstructured":"Md\u00a0Aamir Raihan, Negar Goli, and Tor\u00a0M. Aamodt. 2018. Modeling Deep Learning Accelerator Enabled GPUs. 2019 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS) (2018), 79\u201392. https:\/\/api.semanticscholar.org\/CorpusID:53783076"},{"key":"e_1_3_2_1_30_1","volume-title":"Glu variants improve transformer. arXiv preprint arXiv:2002.05202","author":"Shazeer Noam","year":"2020","unstructured":"Noam Shazeer. 2020. Glu variants improve transformer. arXiv preprint arXiv:2002.05202 (2020)."},{"key":"e_1_3_2_1_31_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models using GPU Model Parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models using GPU Model Parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_32_1","volume-title":"Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model. arXiv preprint arXiv:2201.11990","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, 2022. Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model. arXiv preprint arXiv:2201.11990 (2022)."},{"key":"e_1_3_2_1_33_1","volume-title":"Roformer: Enhanced transformer with rotary position embedding. arXiv preprint arXiv:2104.09864","author":"Su Jianlin","year":"2021","unstructured":"Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. 2021. Roformer: Enhanced transformer with rotary position embedding. arXiv preprint arXiv:2104.09864 (2021)."},{"key":"e_1_3_2_1_34_1","unstructured":"Yuhsiang\u00a0Mike Tsai Terry Cojean and Hartwig Anzt. 2020. Evaluating the Performance of NVIDIA\u2019s A100 Ampere GPU for Sparse Linear Algebra Computations. arxiv:2008.08478\u00a0[cs.MS]"},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is All You Need. Advances in Neural Information Processing Systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. Advances in Neural Information Processing Systems 30 (2017)."},{"key":"e_1_3_2_1_36_1","unstructured":"Ben Wang and Aran Komatsuzaki. 2021. 
GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model."},{"key":"e_1_3_2_1_37_1","volume-title":"ArXiv abs\/1907.10701","author":"Wang Yu\u00a0Emma","year":"2019","unstructured":"Yu\u00a0Emma Wang, Gu-Yeon Wei, and David\u00a0M. Brooks. 2019. Benchmarking TPU, GPU, and CPU Platforms for Deep Learning. ArXiv abs\/1907.10701 (2019). https:\/\/api.semanticscholar.org\/CorpusID:198894674"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00071"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.tbench.2021.100005"},{"key":"e_1_3_2_1_40_1","volume-title":"ByteTransformer: A High-Performance Transformer Boosted for Variable-Length Inputs. In 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE Computer Society","author":"Zhai Y.","unstructured":"Y. Zhai, C. Jiang, L. Wang, X. Jia, S. Zhang, Z. Chen, X. Liu, and Y. Zhu. 2023. ByteTransformer: A High-Performance Transformer Boosted for Variable-Length Inputs. In 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE Computer Society, Los Alamitos, CA, USA, 344\u2013355."},{"key":"e_1_3_2_1_41_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."}],"event":{"name":"ICPP '24: the 53rd International Conference on Parallel Processing","location":"Gotland Sweden","acronym":"ICPP '24"},"container-title":["Proceedings of the 53rd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673136","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3673038.3673136","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T17:31:05Z","timestamp":1758648665000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673136"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,12]]},"references-count":41,"alternative-id":["10.1145\/3673038.3673136","10.1145\/3673038"],"URL":"https:\/\/doi.org\/10.1145\/3673038.3673136","relation":{},"subject":[],"published":{"date-parts":[[2024,8,12]]},"assertion":[{"value":"2024-08-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
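The record above is the Crossref REST API envelope ("status" / "message-type" / "message") for DOI 10.1145/3673038.3673136. Below is a minimal sketch, not part of the original metadata, of fetching and reading the same record; it assumes the public endpoint https://api.crossref.org/works/{DOI} and the field names visible in the record above.

# Minimal sketch (assumptions: the public Crossref endpoint is reachable and
# returns the same envelope and field names as the record above).
import json
import urllib.request

DOI = "10.1145/3673038.3673136"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

work = record["message"]
print(work["title"][0])                      # paper title
print(work["DOI"], work["type"])             # DOI and work type ("proceedings-article")
print(work["container-title"][0])            # proceedings name
print("references deposited:", len(work.get("reference", [])))
for author in work["author"]:
    print(author.get("given", ""), author["family"])

For production use, Crossref asks clients to identify themselves (e.g. a descriptive User-Agent with a contact address), which the sketch above omits for brevity.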