{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T18:26:37Z","timestamp":1773771997233,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":83,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731050","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:46:17Z","timestamp":1750437977000},"page":"1760-1776","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Debunking the CUDA Myth Towards GPU-based AI Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1500-3074","authenticated-orcid":false,"given":"Yunjae","family":"Lee","sequence":"first","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4778-0729","authenticated-orcid":false,"given":"Juntaek","family":"Lim","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6672-0587","authenticated-orcid":false,"given":"Jehyeon","family":"Bang","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0626-0931","authenticated-orcid":false,"given":"Eunyeong","family":"Cho","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9308-7104","authenticated-orcid":false,"given":"Huijong","family":"Jeong","sequence":"additional","affiliation":[{"name":"SqueezeBits, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7987-527X","authenticated-orcid":false,"given":"Taesu","family":"Kim","sequence":"additional","affiliation":[{"name":"SqueezeBits, Seoul, Republic of 
Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8403-1557","authenticated-orcid":false,"given":"Hyungjun","family":"Kim","sequence":"additional","affiliation":[{"name":"SqueezeBits, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2853-9433","authenticated-orcid":false,"given":"Joonhyung","family":"Lee","sequence":"additional","affiliation":[{"name":"NAVER Cloud, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7549-4404","authenticated-orcid":false,"given":"Jinseop","family":"Im","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2343-586X","authenticated-orcid":false,"given":"Ranggi","family":"Hwang","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3456-9038","authenticated-orcid":false,"given":"Se Jung","family":"Kwon","sequence":"additional","affiliation":[{"name":"NAVER Cloud, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5153-5765","authenticated-orcid":false,"given":"Dongsoo","family":"Lee","sequence":"additional","affiliation":[{"name":"NAVER Cloud, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3303-8681","authenticated-orcid":false,"given":"Minsoo","family":"Rhu","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Albert Gu and Tri Dao. 2024. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. https:\/\/github.com\/state-spaces\/mamba."},{"key":"e_1_3_3_2_3_2","unstructured":"Amazon Web Services. 2024. AWS Inferentia. https:\/\/aws.amazon.com\/ai\/machine-learning\/inferentia\/."},{"key":"e_1_3_3_2_4_2","unstructured":"Amazon Web Services. 2024. Machine Learning (ML) on AWS. 
https:\/\/aws.amazon.com\/ai\/machine-learning\/."},{"key":"e_1_3_3_2_5_2","unstructured":"AMD. 2024. AMD MI300. https:\/\/www.amd.com\/en\/products\/accelerators\/instinct\/mi300.html."},{"key":"e_1_3_3_2_6_2","unstructured":"AMD. 2025. AMD Ryzen AI Max Series Processors. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/partner-hub\/ryzen\/ryzen-ai-max-series-how-to-sell-guide-competitive.pdf."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00080"},{"key":"e_1_3_3_2_8_2","unstructured":"Jayaram Bobba Tzachi Cohen Dibyendu Das Sergei Grechanik and Dafna Mordechai. 2024. Speeding Up Intel Gaudi Deep Learning Accelerators Using an MLIR-Based Compiler. https:\/\/llvm.org\/devmtg\/2024-10\/slides\/quicktalks\/Bobba-SpeedingUpIntelGaudi.pdf."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.16"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00080"},{"key":"e_1_3_3_2_11_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.02311","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung\u00a0Won Chung, Charles Sutton, Sebastian Gehrmann, Parker Schuh, Kensen Shi, Sasha Tsvyashchenko, Joshua Maynez, Abhishek Rao, Parker Barnes, Yi Tay, Noam Shazeer, Vinodkumar Prabhakaran, Emily Reif, Nan Du, Ben Hutchinson, Reiner Pope, James Bradbury, Jacob Austin, Michael Isard, Guy Gur-Ari, Pengcheng Yin, Toju Duke, Anselm Levskaya, Sanjay Ghemawat, Sunipa Dev, Henryk Michalewski, Xavier Garcia, Vedant Misra, Kevin Robinson, Liam Fedus, Denny Zhou, Daphne Ippolito, David Luan, Hyeontaek Lim, Barret Zoph, Alexander Spiridonov, Ryan Sepassi, David Dohan, Shivani Agrawal, Mark Omernick, Andrew\u00a0M. 
Dai, Thanumalayan\u00a0Sankaranarayana Pillai, Marie Pellat, Aitor Lewkowycz, Erica Moreira, Rewon Child, Oleksandr Polozov, Katherine Lee, Zongwei Zhou, Xuezhi Wang, Brennan Saeta, Mark Diaz, Orhan Firat, Michele Catasta, Jason Wei, Kathy Meier-Hellstern, Douglas Eck, Jeff Dean, Slav Petrov, and Noah Fiedel. 2022. PaLM: Scaling Language Modeling with Pathways. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.02311."},{"key":"e_1_3_3_2_12_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.08691."},{"key":"e_1_3_3_2_13_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, Anirudh Goyal, Anthony Hartshorn, Aobo Yang, Archi Mitra, Archie Sravankumar, Artem Korenev, Arthur Hinsvark, Arun Rao, Aston Zhang, Aurelien Rodriguez, Austen Gregerson, Ava Spataru, Baptiste Roziere, Bethany Biron, Binh Tang, Bobbie Chern, Charlotte Caucheteux, Chaya Nayak, Chloe Bi, Chris Marra, Chris McConnell, Christian Keller, Christophe Touret, Chunyang Wu, Corinne Wong, Cristian\u00a0Canton Ferrer, Cyrus Nikolaidis, Damien Allonsius, Daniel Song, Danielle Pintz, Danny Livshits, David Esiobu, Dhruv Choudhary, Dhruv Mahajan, Diego Garcia-Olano, Diego Perino, Dieuwke Hupkes, Egor Lakomkin, Ehab AlBadawy, Elina Lobanova, Emily Dinan, Eric\u00a0Michael Smith, Filip Radenovic, Frank Zhang, Gabriel Synnaeve, Gabrielle Lee, Georgia\u00a0Lewis Anderson, Graeme Nail, Gregoire Mialon, Guan Pang, Guillem Cucurell, Hailey Nguyen, Hannah Korevaar, Hu Xu, Hugo Touvron, Iliyan Zarov, Imanol\u00a0Arrieta Ibarra, Isabel Kloumann, Ishan Misra, Ivan 
Evtimov, Jade Copet, Jaewon Lee, Jan Geffert, Jana Vranes, Jason Park, Jay Mahadeokar, Jeet Shah, Jelmer van\u00a0der Linde, Jennifer Billock, Jenny Hong, Jenya Lee, Jeremy Fu, Jianfeng Chi, Jianyu Huang, Jiawen Liu, Jie Wang, Jiecao Yu, Joanna Bitton, Joe Spisak, Jongsoo Park, Joseph Rocca, Joshua Johnstun, Joshua Saxe, Junteng Jia, Kalyan\u00a0Vasuden Alwala, Kartikeya Upasani, Kate Plawiak, Ke Li, Kenneth Heafield, Kevin Stone, Khalid El-Arini, Krithika Iyer, Kshitiz Malik, Kuenley Chiu, Kunal Bhalla, Lauren Rantala-Yeary, Laurens van\u00a0der Maaten, Lawrence Chen, Liang Tan, Liz Jenkins, Louis Martin, Lovish Madaan, Lubo Malo, Lukas Blecher, Lukas Landzaat, Luke de Oliveira, Madeline Muzzi, Mahesh Pasupuleti, Mannat Singh, Manohar Paluri, Marcin Kardas, Mathew Oldham, Mathieu Rita, Maya Pavlova, Melanie Kambadur, Mike Lewis, Min Si, Mitesh\u00a0Kumar Singh, Mona Hassan, Naman Goyal, Narjes Torabi, Nikolay Bashlykov, Nikolay Bogoychev, Niladri Chatterji, Olivier Duchenne, Onur \u00c7elebi, Patrick Alrassy, Pengchuan Zhang, Pengwei Li, Petar Vasic, Peter Weng, Prajjwal Bhargava, Pratik Dubal, Praveen Krishnan, Punit\u00a0Singh Koura, Puxin Xu, Qing He, Qingxiao Dong, Ragavan Srinivasan, Raj Ganapathy, Ramon Calderer, Ricardo\u00a0Silveira Cabral, Robert Stojnic, Roberta Raileanu, Rohit Girdhar, Rohit Patel, Romain Sauvestre, Ronnie Polidoro, Roshan Sumbaly, Ross Taylor, Ruan Silva, Rui Hou, Rui Wang, Saghar Hosseini, Sahana Chennabasappa, Sanjay Singh, Sean Bell, Seohyun\u00a0Sonia Kim, Sergey Edunov, Shaoliang Nie, Sharan Narang, Sharath Raparthy, Sheng Shen, Shengye Wan, Shruti Bhosale, Shun Zhang, Simon Vandenhende, Soumya Batra, Spencer Whitman, Sten Sootla, Stephane Collot, Suchin Gururangan, Sydney Borodinsky, Tamar Herman, Tara Fowler, Tarek Sheasha, Thomas Georgiou, Thomas Scialom, Tobias Speckbacher, Todor Mihaylov, Tong Xiao, Ujjwal Karn, Vedanuj Goswami, Vibhor Gupta, Vignesh Ramanathan, Viktor Kerkez, Vincent Gonguet, Virginie Do, Vish Vogeti, Vladan 
Petrovic, Weiwei Chu, Wenhan Xiong, Wenyin Fu, Whitney Meers, Xavier Martinet, Xiaodong Wang, Xiaoqing\u00a0Ellen Tan, Xinfeng Xie, Xuchao Jia, Xuewei Wang, Yaelle Goldschlag, Yashesh Gaur, Yasmine Babaei, Yi Wen, Yiwen Song, Yuchen Zhang, Yue Li, Yuning Mao, Zacharie\u00a0Delpierre Coudert, Zheng Yan, Zhengxing Chen, Zoe Papakipos, Aaditya Singh, Aaron Grattafiori, Abha Jain, Adam Kelsey, Adam Shajnfeld, Adithya Gangidi, Adolfo Victoria, Ahuva Goldstand, Ajay Menon, Ajay Sharma, Alex Boesenberg, Alex Vaughan, Alexei Baevski, Allie Feinstein, Amanda Kallet, Amit Sangani, Anam Yunus, Andrei Lupu, Andres Alvarado, Andrew Caples, Andrew Gu, Andrew Ho, Andrew Poulton, Andrew Ryan, Ankit Ramchandani, Annie Franco, Aparajita Saraf, Arkabandhu Chowdhury, Ashley Gabriel, Ashwin Bharambe, Assaf Eisenman, Azadeh Yazdan, Beau James, Ben Maurer, Benjamin Leonhardi, Bernie Huang, Beth Loyd, Beto\u00a0De Paola, Bhargavi Paranjape, Bing Liu, Bo Wu, Boyu Ni, Braden Hancock, Bram Wasti, Brandon Spence, Brani Stojkovic, Brian Gamido, Britt Montalvo, Carl Parker, Carly Burton, Catalina Mejia, Changhan Wang, Changkyu Kim, Chao Zhou, Chester Hu, Ching-Hsiang Chu, Chris Cai, Chris Tindal, Christoph Feichtenhofer, Damon Civin, Dana Beaty, Daniel Kreymer, Daniel Li, Danny Wyatt, David Adkins, David Xu, Davide Testuggine, Delia David, Devi Parikh, Diana Liskovich, Didem Foss, Dingkang Wang, Duc Le, Dustin Holland, Edward Dowling, Eissa Jamil, Elaine Montgomery, Eleonora Presani, Emily Hahn, Emily Wood, Erik Brinkman, Esteban Arcaute, Evan Dunbar, Evan Smothers, Fei Sun, Felix Kreuk, Feng Tian, Firat Ozgenel, Francesco Caggioni, Francisco Guzm\u00e1n, Frank Kanayet, Frank Seide, Gabriela\u00a0Medina Florez, Gabriella Schwarz, Gada Badeer, Georgia Swee, Gil Halpern, Govind Thattai, Grant Herman, Grigory Sizov, Guangyi\u00a0(Jack) Zhang, Guna Lakshminarayanan, Hamid Shojanazeri, Han Zou, Hannah Wang, Hanwen Zha, Haroun Habeeb, Harrison Rudolph, Helen Suk, Henry Aspegren, Hunter Goldman, 
Ibrahim Damlaj, Igor Molybog, Igor Tufanov, Irina-Elena Veliche, Itai Gat, Jake Weissman, James Geboski, James Kohli, Japhet Asher, Jean-Baptiste Gaya, Jeff Marcus, Jeff Tang, Jennifer Chan, Jenny Zhen, Jeremy Reizenstein, Jeremy Teboul, Jessica Zhong, Jian Jin, Jingyi Yang, Joe Cummings, Jon Carvill, Jon Shepard, Jonathan McPhie, Jonathan Torres, Josh Ginsburg, Junjie Wang, Kai Wu, Kam\u00a0Hou U, Karan Saxena, Karthik Prasad, Kartikay Khandelwal, Katayoun Zand, Kathy Matosich, Kaushik Veeraraghavan, Kelly Michelena, Keqian Li, Kun Huang, Kunal Chawla, Kushal Lakhotia, Kyle Huang, Lailin Chen, Lakshya Garg, Lavender A, Leandro Silva, Lee Bell, Lei Zhang, Liangpeng Guo, Licheng Yu, Liron Moshkovich, Luca Wehrstedt, Madian Khabsa, Manav Avalani, Manish Bhatt, Maria Tsimpoukelli, Martynas Mankus, Matan Hasson, Matthew Lennie, Matthias Reso, Maxim Groshev, Maxim Naumov, Maya Lathi, Meghan Keneally, Michael\u00a0L. Seltzer, Michal Valko, Michelle Restrepo, Mihir Patel, Mik Vyatskov, Mikayel Samvelyan, Mike Clark, Mike Macey, Mike Wang, Miquel\u00a0Jubert Hermoso, Mo Metanat, Mohammad Rastegari, Munish Bansal, Nandhini Santhanam, Natascha Parks, Natasha White, Navyata Bawa, Nayan Singhal, Nick Egebo, Nicolas Usunier, Nikolay\u00a0Pavlovich Laptev, Ning Dong, Ning Zhang, Norman Cheng, Oleg Chernoguz, Olivia Hart, Omkar Salpekar, Ozlem Kalinli, Parkin Kent, Parth Parekh, Paul Saab, Pavan Balaji, Pedro Rittner, Philip Bontrager, Pierre Roux, Piotr Dollar, Polina Zvyagina, Prashant Ratanchandani, Pritish Yuvraj, Qian Liang, Rachad Alao, Rachel Rodriguez, Rafi Ayub, Raghotham Murthy, Raghu Nayani, Rahul Mitra, Raymond Li, Rebekkah Hogan, Robin Battey, Rocky Wang, Rohan Maheswari, Russ Howes, Ruty Rinott, Sai\u00a0Jayesh Bondu, Samyak Datta, Sara Chugh, Sara Hunt, Sargun Dhillon, Sasha Sidorov, Satadru Pan, Saurabh Verma, Seiji Yamamoto, Sharadh Ramaswamy, Shaun Lindsay, Shaun Lindsay, Sheng Feng, Shenghao Lin, Shengxin\u00a0Cindy Zha, Shiva Shankar, Shuqiang Zhang, Shuqiang 
Zhang, Sinong Wang, Sneha Agarwal, Soji Sajuyigbe, Soumith Chintala, Stephanie Max, Stephen Chen, Steve Kehoe, Steve Satterfield, Sudarshan Govindaprasad, Sumit Gupta, Sungmin Cho, Sunny Virk, Suraj Subramanian, Sy Choudhury, Sydney Goldman, Tal Remez, Tamar Glaser, Tamara Best, Thilo Kohler, Thomas Robinson, Tianhe Li, Tianjun Zhang, Tim Matthews, Timothy Chou, Tzook Shaked, Varun Vontimitta, Victoria Ajayi, Victoria Montanez, Vijai Mohan, Vinay\u00a0Satish Kumar, Vishal Mangla, V\u00edtor Albiero, Vlad Ionescu, Vlad Poenaru, Vlad\u00a0Tiberiu Mihailescu, Vladimir Ivanov, Wei Li, Wenchen Wang, Wenwen Jiang, Wes Bouaziz, Will Constable, Xiaocheng Tang, Xiaofang Wang, Xiaojian Wu, Xiaolan Wang, Xide Xia, Xilun Wu, Xinbo Gao, Yanjun Chen, Ye Hu, Ye Jia, Ye Qi, Yenda Li, Yilin Zhang, Ying Zhang, Yossi Adi, Youngjin Nam, Yu\u00a0(Sid) Wang, Yuchen Hao, Yundi Qian, Yuzi He, Zach Rait, Zachary DeVito, Zef Rosnbrick, Zhaoduo Wen, Zhenyu Yang, and Zhiwei Zhao. 2024. The Llama 3 Herd of Models. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783."},{"key":"e_1_3_3_2_14_2","unstructured":"Dynamic Sonnet Dataset. 2024. https:\/\/huggingface.co\/datasets\/squeezebits\/dynamic_sonnet_llama3."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW63119.2024.00016"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589348"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000093"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00062"},{"key":"e_1_3_3_2_20_2","unstructured":"Google Cloud. 2024. Tensor Processing Units (TPU). https:\/\/cloud.google.com\/tpu."},{"key":"e_1_3_3_2_21_2","unstructured":"Google Cloud. 2024. Vertex AI. https:\/\/cloud.google.com\/vertex-ai\/."},{"key":"e_1_3_3_2_22_2","unstructured":"Alan Gray. 2019. Getting Started with CUDA Graphs. 
https:\/\/developer.nvidia.com\/blog\/cuda-graphs\/."},{"key":"e_1_3_3_2_23_2","unstructured":"Hugging Face. 2022. Optimum for Intel Gaudi Accelerators. https:\/\/github.com\/huggingface\/optimum-habana."},{"key":"e_1_3_3_2_24_2","unstructured":"InfiniBand Trade Association. 2024. RDMA over Converged Ethernet. https:\/\/www.roceinitiative.org\/."},{"key":"e_1_3_3_2_25_2","unstructured":"Intel. 2023. Habana Gaudi-2 White Paper. https:\/\/www.intel.com\/content\/www\/us\/en\/content-details\/784827\/gaudi-2-white-paper.html."},{"key":"e_1_3_3_2_26_2","unstructured":"Intel. 2023. Intel HLS-Gaudi2 AI Accelerator Server. https:\/\/habana.ai\/wp-content\/uploads\/2023\/10\/HLS-Gaudi2_Datasheet_10_23.pdf."},{"key":"e_1_3_3_2_27_2","unstructured":"Intel. 2024. Fused Scaled Dot Product Attention for Gaudi. https:\/\/docs.habana.ai\/en\/latest\/PyTorch\/Reference\/Python_Packages.html#hpex-kernels-fusedsdpa."},{"key":"e_1_3_3_2_28_2","unstructured":"Intel. 2024. Gaudi TPC Architectural Overview. https:\/\/docs.habana.ai\/en\/latest\/TPC\/TPC_User_Guide\/Processor_Architectural_Overview.html."},{"key":"e_1_3_3_2_29_2","unstructured":"Intel. 2024. Habana Collective Communications Library (HCCL). https:\/\/github.com\/HabanaAI\/hccl_demo."},{"key":"e_1_3_3_2_30_2","unstructured":"Intel. 2024. Habana HPU Graphs. https:\/\/docs.habana.ai\/en\/latest\/PyTorch\/Reference\/Python_Packages.html#hpu-graph-apis."},{"key":"e_1_3_3_2_31_2","unstructured":"Intel. 2024. Intel Gaudi 3 AI Accelerator Technical Paper. https:\/\/www.intel.com\/content\/www\/us\/en\/content-details\/817486\/intel-gaudi-3-ai-accelerator-white-paper.html."},{"key":"e_1_3_3_2_32_2","unstructured":"Intel. 2024. Intel Gaudi Software Suite. https:\/\/habana.ai\/intel-gaudi-software\/."},{"key":"e_1_3_3_2_33_2","unstructured":"Intel. 2024. System Management Interface Tool (hl-smi). 
https:\/\/docs.habana.ai\/en\/latest\/Management_and_Monitoring\/Embedded_System_Tools_Guide\/System_Management_Interface_Tool.html."},{"key":"e_1_3_3_2_34_2","unstructured":"Intel. 2024. TPC-C Language. https:\/\/docs.habana.ai\/en\/latest\/TPC\/TPC_User_Guide\/TPC_C_Language.html."},{"key":"e_1_3_3_2_35_2","unstructured":"Intel. 2024. vLLM Fork for Gaudi. https:\/\/github.com\/HabanaAI\/vllm-fork."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3547387"},{"key":"e_1_3_3_2_37_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.06826","author":"Jia Zhe","year":"2018","unstructured":"Zhe Jia, Marco Maggioni, Benjamin Staiger, and Daniele\u00a0P Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture Via Microbenchmarking. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.06826."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2018.00035"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS61935.2024.10665178"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1816021"},{"key":"e_1_3_3_2_47_2","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim 
Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/1188455.1188677"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_2_50_2","unstructured":"John\u00a0D McCalpin. 1995. Memory Bandwidth and Machine Balance in Current High Performance Computers. IEEE Computer Society Technical Committee on Computer Architecture Newsletter 2 19-25 (1995)."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Xinxin Mei and Xiaowen Chu. 2017. Dissecting GPU Memory Hierarchy Through Microbenchmarking. IEEE Transactions on Parallel and Distributed Systems 28 1 (2017) 72\u201386.","DOI":"10.1109\/TPDS.2016.2549523"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815992"},{"key":"e_1_3_3_2_53_2","unstructured":"MLPerf. 2024. MLCommons (MLPerf) Inference Benchmarks for Recommendation Task. https:\/\/github.com\/mlcommons\/inference\/tree\/master\/recommendation\/dlrm_v2\/pytorch."},{"key":"e_1_3_3_2_54_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.00091","author":"Naumov Maxim","year":"2019","unstructured":"Maxim Naumov, Dheevatsa Mudigere, Hao-Jun\u00a0Michael Shi, Jianyu Huang, Narayanan Sundaraman, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, Alisson\u00a0G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.00091."},{"key":"e_1_3_3_2_55_2","unstructured":"NVIDIA. 2018. 
NVSwitch: Leveraging NVLink to Maximum Effect. https:\/\/developer.nvidia.com\/blog\/nvswitch-leveraging-nvlink-to-maximum-effect\/."},{"key":"e_1_3_3_2_56_2","unstructured":"NVIDIA. 2024. cuBLAS. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_3_2_57_2","unstructured":"NVIDIA. 2024. cuDNN. https:\/\/developer.nvidia.com\/cudnn."},{"key":"e_1_3_3_2_58_2","unstructured":"NVIDIA. 2024. cuSOLVER. https:\/\/developer.nvidia.com\/cusolver."},{"key":"e_1_3_3_2_59_2","unstructured":"NVIDIA. 2024. cuSPARSE. https:\/\/developer.nvidia.com\/cusparse."},{"key":"e_1_3_3_2_60_2","unstructured":"NVIDIA. 2024. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_3_2_61_2","unstructured":"NVIDIA. 2024. NVIDIA System Management Interface. https:\/\/developer.download.nvidia.com\/compute\/DCGM\/docs\/nvidia-smi-367.38.pdf."},{"key":"e_1_3_3_2_62_2","unstructured":"NVIDIA. 2024. NVIDIA Tensor Cores. https:\/\/www.nvidia.com\/en-us\/data-center\/tensor-cores\/."},{"key":"e_1_3_3_2_63_2","unstructured":"NVIDIA. 2024. Performance Reported by NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests\/blob\/master\/doc\/PERFORMANCE.md."},{"key":"e_1_3_3_2_64_2","unstructured":"NVIDIA. 2024. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM."},{"key":"e_1_3_3_2_65_2","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul\u00a0F. Christiano, Jan Leike, and Ryan Lowe. 2022. Training Language Models to Follow Instructions with Human Feedback. 
In Proceedings of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00084"},{"key":"e_1_3_3_2_67_2","unstructured":"PyTorch. 2024. FBGEMM GPU Python API. https:\/\/pytorch.org\/FBGEMM\/fbgemm_gpu-python-api\/tbe_ops_training.html."},{"key":"e_1_3_3_2_68_2","unstructured":"PyTorch. 2024. Scaled Dot Product Attention (SDPA) Python API. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.nn.functional.scaled_dot_product_attention.html."},{"key":"e_1_3_3_2_69_2","unstructured":"RAPIDS. 2024. cuDF: GPU DataFrames. https:\/\/github.com\/rapidsai\/cudf."},{"key":"e_1_3_3_2_70_2","unstructured":"RAPIDS. 2024. cuVS: Vector Search and Clustering on the GPU. https:\/\/github.com\/rapidsai\/cuvs."},{"key":"e_1_3_3_2_71_2","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA)","author":"Rhu Minsoo","year":"2013","unstructured":"Minsoo Rhu and Mattan Erez. 2013. The Dual-Path Execution Model for Efficient GPU Control Flow. In Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA)."},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540717"},{"key":"e_1_3_3_2_73_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.08053."},{"key":"e_1_3_3_2_74_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.02944","author":"Shoushtary Mojtaba\u00a0Abaie","year":"2024","unstructured":"Mojtaba\u00a0Abaie Shoushtary, Jordi\u00a0Tubella Murgadas, and Antonio Gonzalez. 2024. Control Flow Management in Modern GPUs. 
In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.02944."},{"key":"e_1_3_3_2_75_2","unstructured":"SqueezeBits. 2025. https:\/\/blog.squeezebits.com\/intel-gaudi-3-performance-evaluation-with-synapseai-v119-39839."},{"key":"e_1_3_3_2_76_2","unstructured":"Guillaume Thomas-collignon and Vishal Mehta. 2020. Optimizing CUDA Applications for NVIDIA A100 GPU. https:\/\/developer.download.nvidia.com\/video\/gputechconf\/gtc\/2020\/presentations\/s21819-optimizing-applications-for-nvidia-ampere-gpu-architecture.pdf."},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_3_2_78_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971."},{"key":"e_1_3_3_2_79_2","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In Proceedings of the International Conference on Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_3_2_80_2","unstructured":"VIA Research. 2025. https:\/\/github.com\/VIA-Research\/Intel-Gaudi-AI-benchmarks."},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3450078"},{"key":"e_1_3_3_2_82_2","unstructured":"Shibo Wang and Pankaj Kanwar. 2019. BFloat16: The Secret to High Performance on Cloud TPUs. 
https:\/\/cloud.google.com\/blog\/products\/ai-machine-learning\/bfloat16-the-secret-to-high-performance-on-cloud-tpus?hl=en."},{"key":"e_1_3_3_2_83_2","volume-title":"Proceedings of USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. ORCA: A Distributed Serving System for Transformer-Based Generative Models. In Proceedings of USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624257"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"ISCA '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731050","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:06:32Z","timestamp":1750503992000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731050"}},"subtitle":["Evaluation of the Performance and Programmability of Intel's Gaudi NPU for AI Model Serving"],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":83,"alternative-id":["10.1145\/3695053.3731050","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731050","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}