{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T03:35:07Z","timestamp":1769657707510,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"name":"NSF of China","award":["U22A2028"],"award-info":[{"award-number":["U22A2028"]}]},{"name":"NSF of China","award":["62302483"],"award-info":[{"award-number":["62302483"]}]},{"name":"Strategic Priority Research Program of the Chinese Academy of Sciences","award":["XDB0660300"],"award-info":[{"award-number":["XDB0660300"]}]},{"name":"Strategic Priority Research Program of the Chinese Academy of Sciences","award":["XDB0660302"],"award-info":[{"award-number":["XDB0660302"]}]},{"name":"CAS Project for Young Scientists in Basic Research","award":["YSBR-029"],"award-info":[{"award-number":["YSBR-029"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,1,28]]},"DOI":"10.1145\/3774934.3786425","type":"proceedings-article","created":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T15:25:57Z","timestamp":1769613957000},"page":"605-619","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FlashAttention-T: Towards Fully Tensorized Attention by Exploiting Tensor-Vector Parallelism"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0373-411X","authenticated-orcid":false,"given":"Jianxing","family":"Xu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"},{"name":"Institute of\u00a0Computing\u00a0Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7775-2724","authenticated-orcid":false,"given":"Yuanbo","family":"Wen","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9956-7039","authenticated-orcid":false,"given":"Jun","family":"Bi","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1830-4483","authenticated-orcid":false,"given":"Ruibai","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"},{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7743-9088","authenticated-orcid":false,"given":"Guanglin","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8691-8549","authenticated-orcid":false,"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0237-1034","authenticated-orcid":false,"given":"Wei","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8877-9052","authenticated-orcid":false,"given":"Ling","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7601-0753","authenticated-orcid":false,"given":"Tianshi","family":"Chen","sequence":"additional","affiliation":[{"name":"Cambricon Technologies, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2530-5874","authenticated-orcid":false,"given":"Qi","family":"Guo","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3925-5185","authenticated-orcid":false,"given":"Yunji","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2026,1,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC55821.2022.9926299"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.EMNLP-MAIN.298"},{"key":"e_1_3_2_1_3_1","volume-title":"Make Your LLM Fully Utilize the Context. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024","author":"An Shengnan","year":"2024","unstructured":"Shengnan An, Zexiong Ma, Zeqi Lin, Nanning Zheng, Jian-Guang Lou, and Weizhu Chen. 2024. Make Your LLM Fully Utilize the Context. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/71c3451f6cd6a4f82bb822db25cea4fd-Abstract-Conference.html"},{"key":"e_1_3_2_1_4_1","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Pond\u00e9 de Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe Petroski Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William Hebgen Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew N. Carr Jan Leike Joshua Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating Large Language Models Trained on Code. CoRR abs\/2107.03374 (2021) arXiv:2107.03374. arxiv:2107.03374"},{"key":"e_1_3_2_1_5_1","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec"},{"key":"e_1_3_2_1_6_1","volume-title":"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022, Sanmi Koyejo, S. Mohamed, A. Agarwal, Danielle Belgrave, K. Cho, and A. Oh (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2022\/hash\/67d57c32e20fd0a7a302cb81d36e40d5-Abstract-Conference.html"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2405.04434"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aurelien Rodriguez Austen Gregerson Ava Spataru Baptiste Roziere Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits Danny Wyatt David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric Michael Smith Filip Radenovic Francisco Guzm\u00e1n Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia Lewis Anderson Govind Thattai Graeme Nail Gregoire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol Arrieta Ibarra Isabel Kloumann Ishan Misra Ivan Evtimov Jack Zhang Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan Vasuden Alwala Karthik Prasad Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone Khalid El-Arini Krithika Iyer Kshitiz Malik Kuenley Chiu Kunal Bhalla Kushal Lakhotia Lauren Rantala-Yeary Laurens van der Maaten Lawrence Chen Liang Tan Liz Jenkins Louis Martin Lovish Madaan Lubo Malo Lukas Blecher Lukas Landzaat Luke de Oliveira Madeline Muzzi Mahesh Pasupuleti Mannat Singh Manohar Paluri Marcin Kardas Maria Tsimpoukelli Mathew Oldham Mathieu Rita Maya Pavlova Melanie Kambadur Mike Lewis Min Si Mitesh Kumar Singh Mona Hassan Naman Goyal Narjes Torabi Nikolay Bashlykov Nikolay Bogoychev Niladri Chatterji Ning Zhang Olivier Duchenne Onur \u00c7elebi Patrick Alrassy Pengchuan Zhang Pengwei Li Petar Vasic Peter Weng Prajjwal Bhargava Pratik Dubal Praveen Krishnan Punit Singh Koura Puxin Xu Qing He Qingxiao Dong Ragavan Srinivasan Raj Ganapathy Ramon Calderer Ricardo Silveira Cabral Robert Stojnic Roberta Raileanu Rohan Maheswari Rohit Girdhar Rohit Patel Romain Sauvestre Ronnie Polidoro Roshan Sumbaly Ross Taylor Ruan Silva Rui Hou Rui Wang Saghar Hosseini Sahana Chennabasappa Sanjay Singh Sean Bell Seohyun Sonia Kim Sergey Edunov Shaoliang Nie Sharan Narang Sharath Raparthy Sheng Shen Shengye Wan Shruti Bhosale Shun Zhang Simon Vandenhende Soumya Batra Spencer Whitman Sten Sootla Stephane Collot Suchin Gururangan Sydney Borodinsky Tamar Herman Tara Fowler Tarek Sheasha Thomas Georgiou Thomas Scialom Tobias Speckbacher Todor Mihaylov Tong Xiao Ujjwal Karn Vedanuj Goswami Vibhor Gupta Vignesh Ramanathan Viktor Kerkez Vincent Gonguet Virginie Do Vish Vogeti V\u00edtor Albiero Vladan Petrovic Weiwei Chu Wenhan Xiong Wenyin Fu Whitney Meers Xavier Martinet Xiaodong Wang Xiaofang Wang Xiaoqing Ellen Tan Xide Xia Xinfeng Xie Xuchao Jia Xuewei Wang Yaelle Goldschlag Yashesh Gaur Yasmine Babaei Yi Wen Yiwen Song Yuchen Zhang Yue Li Yuning Mao Zacharie Delpierre Coudert Zheng Yan Zhengxing Chen Zoe Papakipos Aaditya Singh Aayushi Srivastava Abha Jain Adam Kelsey Adam Shajnfeld Adithya Gangidi Adolfo Victoria Ahuva Goldstand Ajay Menon Ajay Sharma Alex Boesenberg Alexei Baevski Allie Feinstein Amanda Kallet Amit Sangani Amos Teo Anam Yunus Andrei Lupu Andres Alvarado Andrew Caples Andrew Gu Andrew Ho Andrew Poulton Andrew Ryan Ankit Ramchandani Annie Dong Annie Franco Anuj Goyal Aparajita Saraf Arkabandhu Chowdhury Ashley Gabriel Ashwin Bharambe Assaf Eisenman Azadeh Yazdan Beau James Ben Maurer Benjamin Leonhardi Bernie Huang Beth Loyd Beto De Paola Bhargavi Paranjape Bing Liu Bo Wu Boyu Ni Braden Hancock Bram Wasti Brandon Spence Brani Stojkovic Brian Gamido Britt Montalvo Carl Parker Carly Burton Catalina Mejia Ce Liu Changhan Wang Changkyu Kim Chao Zhou Chester Hu Ching-Hsiang Chu Chris Cai Chris Tindal Christoph Feichtenhofer Cynthia Gao Damon Civin Dana Beaty Daniel Kreymer Daniel Li David Adkins David Xu Davide Testuggine Delia David Devi Parikh Diana Liskovich Didem Foss Dingkang Wang Duc Le Dustin Holland Edward Dowling Eissa Jamil Elaine Montgomery Eleonora Presani Emily Hahn Emily Wood Eric-Tuan Le Erik Brinkman Esteban Arcaute Evan Dunbar Evan Smothers Fei Sun Felix Kreuk Feng Tian Filippos Kokkinos Firat Ozgenel Francesco Caggioni Frank Kanayet Frank Seide Gabriela Medina Florez Gabriella Schwarz Gada Badeer Georgia Swee Gil Halpern Grant Herman Grigory Sizov Guangyi Zhang Guna Lakshminarayanan Hakan Inan Hamid Shojanazeri Han Zou Hannah Wang Hanwen Zha Haroun Habeeb Harrison Rudolph Helen Suk Henry Aspegren Hunter Goldman Hongyuan Zhan Ibrahim Damlaj Igor Molybog Igor Tufanov Ilias Leontiadis Irina-Elena Veliche Itai Gat Jake Weissman James Geboski James Kohli Janice Lam Japhet Asher Jean-Baptiste Gaya Jeff Marcus Jeff Tang Jennifer Chan Jenny Zhen Jeremy Reizenstein Jeremy Teboul Jessica Zhong Jian Jin Jingyi Yang Joe Cummings Jon Carvill Jon Shepard Jonathan McPhie Jonathan Torres Josh Ginsburg Junjie Wang Kai Wu Kam Hou U Karan Saxena Kartikay Khandelwal Katayoun Zand Kathy Matosich Kaushik Veeraraghavan Kelly Michelena Keqian Li Kiran Jagadeesh Kun Huang Kunal Chawla Kyle Huang Lailin Chen Lakshya Garg Lavender A Leandro Silva Lee Bell Lei Zhang Liangpeng Guo Licheng Yu Liron Moshkovich Luca Wehrstedt Madian Khabsa Manav Avalani Manish Bhatt Martynas Mankus Matan Hasson Matthew Lennie Matthias Reso Maxim Groshev Maxim Naumov Maya Lathi Meghan Keneally Miao Liu Michael L. Seltzer Michal Valko Michelle Restrepo Mihir Patel Mik Vyatskov Mikayel Samvelyan Mike Clark Mike Macey Mike Wang Miquel Jubert Hermoso Mo Metanat Mohammad Rastegari Munish Bansal Nandhini Santhanam Natascha Parks Natasha White Navyata Bawa Nayan Singhal Nick Egebo Nicolas Usunier Nikhil Mehta Nikolay Pavlovich Laptev Ning Dong Norman Cheng Oleg Chernoguz Olivia Hart Omkar Salpekar Ozlem Kalinli Parkin Kent Parth Parekh Paul Saab Pavan Balaji Pedro Rittner Philip Bontrager Pierre Roux Piotr Dollar Polina Zvyagina Prashant Ratanchandani Pritish Yuvraj Qian Liang Rachad Alao Rachel Rodriguez Rafi Ayub Raghotham Murthy Raghu Nayani Rahul Mitra Rangaprabhu Parthasarathy Raymond Li Rebekkah Hogan Robin Battey Rocky Wang Russ Howes Ruty Rinott Sachin Mehta Sachin Siby Sai Jayesh Bondu Samyak Datta Sara Chugh Sara Hunt Sargun Dhillon Sasha Sidorov Satadru Pan Saurabh Mahajan Saurabh Verma Seiji Yamamoto Sharadh Ramaswamy Shaun Lindsay Shaun Lindsay Sheng Feng Shenghao Lin Shengxin Cindy Zha Shishir Patil Shiva Shankar Shuqiang Zhang Shuqiang Zhang Sinong Wang Sneha Agarwal Soji Sajuyigbe Soumith Chintala Stephanie Max Stephen Chen Steve Kehoe Steve Satterfield Sudarshan Govindaprasad Sumit Gupta Summer Deng Sungmin Cho Sunny Virk Suraj Subramanian Sy Choudhury Sydney Goldman Tal Remez Tamar Glaser Tamara Best Thilo Koehler Thomas Robinson Tianhe Li Tianjun Zhang Tim Matthews Timothy Chou Tzook Shaked Varun Vontimitta Victoria Ajayi Victoria Montanez Vijai Mohan Vinay Satish Kumar Vishal Mangla Vlad Ionescu Vlad Poenaru Vlad Tiberiu Mihailescu Vladimir Ivanov Wei Li Wenchen Wang Wenwen Jiang Wes Bouaziz Will Constable Xiaocheng Tang Xiaojian Wu Xiaolan Wang Xilun Wu Xinbo Gao Yaniv Kleinman Yanjun Chen Ye Hu Ye Jia Ye Qi Yenda Li Yilin Zhang Ying Zhang Yossi Adi Youngjin Nam Yu Wang Yu Zhao Yuchen Hao Yundi Qian Yunlu Li Yuzi He Zach Rait Zachary DeVito Zef Rosnbrick Zhaoduo Wen Zhenyu Yang Zhiwei Zhao and Zhiyu Ma. 2024. The Llama 3 Herd of Models. CoRR abs\/2407.21783 (2024) arXiv:2407.21783. https:\/\/doi.org\/10.48550\/ARXIV.2407.21783 10.48550\/ARXIV.2407.21783","DOI":"10.48550\/ARXIV.2407.21783"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2505.02130"},{"key":"e_1_3_2_1_10_1","volume-title":"Measuring Massive Multitask Language Understanding. In 9th International Conference on Learning Representations, ICLR 2021","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=d7KBjmI3GmQ"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the Seventh Annual Conference on Machine Learning and Systems, MLSys 2024","author":"Hong Ke","year":"2024","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, Kangdi Chen, Yuhan Dong, and Yu Wang. 2024. FlashDecoding++: Faster Large Language Model Inference with Asynchronization, Flat GEMM Optimization, and Heuristics. In Proceedings of the Seventh Annual Conference on Machine Learning and Systems, MLSys 2024, Santa Clara, CA, USA, May 13-16, 2024, Phillip B. Gibbons, Gennady Pekhimenko, and Christopher De Sa (Eds.). mlsys.org. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/hash\/5321b1dabcd2be188d796c21b733e8c7-Abstract-Conference.html"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 37th International Conference on Machine Learning, ICML 2020","volume":"4386","author":"Hron Jiri","year":"2020","unstructured":"Jiri Hron, Yasaman Bahri, Jascha Sohl-Dickstein, and Roman Novak. 2020. Infinite attention: NNGP and NTK for deep attention networks. In Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event (Proceedings of Machine Learning Research, Vol. 119). PMLR, 4376\u20134386. http:\/\/proceedings.mlr.press\/v119\/hron20a.html"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2507.10789"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00064"},{"key":"e_1_3_2_1_16_1","volume-title":"Online normalizer calculation for softmax. CoRR, abs\/1805.02867","author":"Milakov Maxim","year":"2018","unstructured":"Maxim Milakov and Natalia Gimelshein. 2018. Online normalizer calculation for softmax. CoRR, abs\/1805.02867 (2018), arXiv:1805.02867. arxiv:1805.02867"},{"key":"e_1_3_2_1_17_1","unstructured":"Mistral AI. 2024. Un Ministral des Ministraux. Online. https:\/\/mistral.ai\/news\/ministraux Model card available at: https:\/\/huggingface.co\/mistralai\/Ministral-8B-Instruct-2410"},{"key":"e_1_3_2_1_18_1","unstructured":"Mistral AI and NVIDIA. 2024. Mistral NeMo 12B: A State-of-the-Art Language Model. Online. https:\/\/mistral.ai\/news\/mistral-nemo\/ Model card available at: https:\/\/huggingface.co\/mistralai\/Mistral-Nemo-Instruct-2407"},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA Corporation. 2020. NVIDIA A100 Tensor Core GPU Architecture. NVIDIA. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf Accessed: 2025-08-06"},{"key":"e_1_3_2_1_20_1","unstructured":"NVIDIA Corporation. 2022. NVIDIA H100 Tensor Core GPU Architecture. NVIDIA. https:\/\/resources.nvidia.com\/en-us-hopper-architecture\/nvidia-h100-tensor-c Accessed: 2025-08-06"},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA Corporation. 2025. Parallel Thread Execution (PTX) ISA Version 9.0. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html Accessed: 2025-08-25"},{"key":"e_1_3_2_1_22_1","unstructured":"OpenAI. 2025. Fused Attention Tutorial. https:\/\/triton-lang.org\/main\/getting-started\/tutorials\/06-fused-attention.html Accessed: 2025-08-11"},{"key":"e_1_3_2_1_23_1","unstructured":"OpenAI. 2025. gpt-oss-120 & gpt-oss-20b Model Card. https:\/\/cdn.openai.com\/pdf\/419b6906-9da6-406c-a19d-1bb078ac7637\/oai_gpt-oss_model_card.pdf Accessed: 2025-08-11"},{"key":"e_1_3_2_1_24_1","volume-title":"High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Z. Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, Hanna M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d\u2019Alch\u00e9-Buc, Emily B. Fox, and Roman Garnett (Eds.). 8024\u20138035. https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/bdbca288fee7f92f2bfa9f7012727740-Abstract.html"},{"key":"e_1_3_2_1_25_1","unstructured":"Richard GE Pinch. 2005. The distance of a permutation from a subgroup of S_n. arXiv preprint math\/0511501."},{"key":"e_1_3_2_1_26_1","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/7ede97c3e082c6df10a8d6103a2eebd2-Abstract-Conference.html"},{"key":"e_1_3_2_1_27_1","volume-title":"Fast Transformer Decoding: One Write-Head is All You Need. CoRR, abs\/1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast Transformer Decoding: One Write-Head is All You Need. CoRR, abs\/1911.02150 (2019), arXiv:1911.02150. arxiv:1911.02150"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2402.17762"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton-Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aur\u00e9lien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. CoRR abs\/2307.09288 (2023) arXiv:2307.09288. https:\/\/doi.org\/10.48550\/ARXIV.2307.09288 10.48550\/ARXIV.2307.09288","DOI":"10.48550\/ARXIV.2307.09288"},{"key":"e_1_3_2_1_31_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 5998\u20136008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.FINDINGS-ACL.32"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence, IJCAI 2024","author":"Wang Xindi","year":"2024","unstructured":"Xindi Wang, Mahsa Salmani, Parsa Omidi, Xiangyu Ren, Mehdi Rezagholizadeh, and Armaghan Eshaghi. 2024. Beyond the Limits: A Survey of Techniques to Extend the Context Length in Large Language Models. In Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence, IJCAI 2024, Jeju, South Korea, August 3-9, 2024. ijcai.org, 8299\u20138307. https:\/\/www.ijcai.org\/proceedings\/2024\/917"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","unstructured":"Jianxing Xu Yuanbo Wen Jun Bi and Ruibai Xu. 2025. Artifact for the PPoPP \u201926 paper: \u201cFlashAttention-T: Towards Fully Tensorized Attention by Exploiting Tensor\u2013Vector Parallelism.\u201d. Zenodo. https:\/\/doi.org\/10.5281\/zenodo.17673796 Version v1 published 2025-11-22 10.5281\/zenodo.17673796","DOI":"10.5281\/zenodo.17673796"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716262"},{"key":"e_1_3_2_1_36_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv Chujie Zheng Dayiheng Liu Fan Zhou Fei Huang Feng Hu Hao Ge Haoran Wei Huan Lin Jialong Tang Jian Yang Jianhong Tu Jianwei Zhang Jian Yang Jiaxi Yang Jingren Zhou Junyang Lin Kai Dang Keqin Bao Kexin Yang Le Yu Lianghao Deng Mei Li Mingfeng Xue Mingze Li Pei Zhang Peng Wang Qin Zhu Rui Men Ruize Gao Shixuan Liu Shuang Luo Tianhao Li Tianyi Tang Wenbiao Yin Xingzhang Ren Xinyu Wang Xinyu Zhang Xuancheng Ren Yang Fan Yang Su Yichang Zhang Yinger Zhang Yu Wan Yuqiong Liu Zekun Wang Zeyu Cui Zhenru Zhang Zhipeng Zhou and Zihan Qiu. 2025. Qwen3 Technical Report. CoRR abs\/2505.09388 (2025) arXiv:2505.09388. https:\/\/doi.org\/10.48550\/ARXIV.2505.09388 10.48550\/ARXIV.2505.09388"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2501.01005"},{"key":"e_1_3_2_1_38_1","volume-title":"Root Mean Square Layer Normalization. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Zhang Biao","year":"2019","unstructured":"Biao Zhang and Rico Sennrich. 2019. Root Mean Square Layer Normalization. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, Hanna M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d\u2019Alch\u00e9-Buc, Emily B. Fox, and Roman Garnett (Eds.). 12360\u201312371. https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/1e8a19426224ca89e83cef47f1e7f53b-Abstract.html"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Zhang Jintao","year":"2025","unstructured":"Jintao Zhang, Haofeng Huang, Pengle Zhang, Jia Wei, Jun Zhu, and Jianfei Chen. 2025. Sageattention2: Efficient attention with thorough outlier smoothing and per-thread int4 quantization. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_40_1","volume-title":"The Thirteenth International Conference on Learning Representations, ICLR 2025","author":"Zhang Jintao","year":"2025","unstructured":"Jintao Zhang, Jia wei, Pengle Zhang, Jun Zhu, and Jianfei Chen. 2025. SageAttention: Accurate 8-Bit Attention for Plug-and-play Inference Acceleration. In The Thirteenth International Conference on Learning Representations, ICLR 2025, Singapore, April 24-28, 2025. OpenReview.net. https:\/\/openreview.net\/forum?id=OL44KtasKc"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD53106.2021.00054"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2303.18223"},{"key":"e_1_3_2_1_43_1","volume-title":"SGLang: Efficient Execution of Structured Language Model Programs. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark W. Barrett, and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/724be4472168f31ba1c9ac630f15dec8-Abstract-Conference.html"}],"event":{"name":"PPoPP '26: 31st ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Sydney NSW Australia","acronym":"PPoPP '26","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGPLAN ACM Special Interest Group on Programming Languages"]},"container-title":["Proceedings of the 31st ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3774934.3786425","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T15:28:01Z","timestamp":1769614081000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774934.3786425"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,28]]},"references-count":43,"alternative-id":["10.1145\/3774934.3786425","10.1145\/3774934"],"URL":"https:\/\/doi.org\/10.1145\/3774934.3786425","relation":{},"subject":[],"published":{"date-parts":[[2026,1,28]]},"assertion":[{"value":"2026-01-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}