{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:45:08Z","timestamp":1773193508259,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710864","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"183-196","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["FlashTensor: Optimizing Tensor Programs by Leveraging Fine-grained Tensor Property"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8654-0192","authenticated-orcid":false,"given":"Runxin","family":"Zhong","sequence":"first","affiliation":[{"name":"Tsinghua University, Qingcheng.AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2358-3395","authenticated-orcid":false,"given":"Yuyang","family":"Jin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9045-9269","authenticated-orcid":false,"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7633-9010","authenticated-orcid":false,"given":"Kinman","family":"Lei","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1733-9246","authenticated-orcid":false,"given":"Shuangyu","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7656-6428","authenticated-orcid":false,"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.)","volume":"6","author":"Adnan Muhammad","year":"2024","unstructured":"Muhammad Adnan, Akhil Arunkumar, Gaurav Jain, Prashant Nair, Ilya Soloveychik, and Purushotham Kamath. 2024. Keyformer: KV Cache reduction through key tokens selection for Efficient Generative Inference. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.), Vol. 6. Machine Learning and Systems, Santa Clara, California, USA, 114--127. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/48fecef47b19fe501d27d338b6d52582-Paper-Conference.pdf"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_3_1","volume-title":"Simulated annealing. Statistical science 8, 1","author":"Bertsimas Dimitris","year":"1993","unstructured":"Dimitris Bertsimas and John Tsitsiklis. 1993. Simulated annealing. Statistical science 8, 1 (1993), 10--15."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495883"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. 
TVM: an automated end-to-end optimizing compiler for deep learning. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (Carlsbad, CA, USA) (OSDI'18). USENIX Association, USA, 579--594."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/3327144.3327258"},{"key":"e_1_3_2_1_7_1","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. Technical Report. NVIDIA."},{"key":"e_1_3_2_1_8_1","unstructured":"NVIDIA cuBLAS. 2016. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_2_1_9_1","volume-title":"CORM: Cache Optimization with Recent Message for Large Language Model Inference. arXiv:2404.15949 [cs.CL] https:\/\/arxiv.org\/abs\/2404.15949","author":"Dai Jincheng","year":"2024","unstructured":"Jincheng Dai, Zhuowei Huang, Haiyun Jiang, Chen Chen, Deng Cai, Wei Bi, and Shuming Shi. 2024. CORM: Cache Optimization with Recent Message for Large Language Model Inference. arXiv:2404.15949 [cs.CL] https:\/\/arxiv.org\/abs\/2404.15949"},{"key":"e_1_3_2_1_10_1","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. arXiv:2307.08691 [cs.LG] https:\/\/arxiv.org\/abs\/2307.08691"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dao Tri","year":"2024","unstructured":"Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2024. FLASHATTENTION: fast and memory-efficient exact attention with IO-awareness. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 1189, 16 pages."},{"key":"e_1_3_2_1_12_1","unstructured":"FlashInfer. 2024. 
https:\/\/flashinfer.ai\/."},{"key":"e_1_3_2_1_13_1","unstructured":"Pytorch FlexAttention. 2024. https:\/\/pytorch.org\/blog\/flexattention\/."},{"key":"e_1_3_2_1_14_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651383"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_17_1","unstructured":"Albert Q.Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra Singh Chaplot Diego de las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2023. Mistral 7B. arXiv:2310.06825 [cs.CL] https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"e_1_3_2_1_18_1","volume-title":"MLIR: A Compiler Infrastructure for the End of Moore's Law. CoRR abs\/2002.11054","author":"Lattner Chris","year":"2020","unstructured":"Chris Lattner, Jacques A. Pienaar, Mehdi Amini, Uday Bondhugula, River Riddle, Albert Cohen, Tatiana Shpeisman, Andy Davis, Nicolas Vasilache, and Oleksandr Zinenko. 2020. MLIR: A Compiler Infrastructure for the End of Moore's Law. CoRR abs\/2002.11054 (2020), 21 pages. arXiv:2002.11054 https:\/\/arxiv.org\/abs\/2002.11054"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13677-024-00592-1"},{"key":"e_1_3_2_1_20_1","unstructured":"Yuhong Li Yingbing Huang Bowen Yang Bharat Venkitesh Acyr Locatelli Hanchen Ye Tianle Cai Patrick Lewis and Deming Chen. 2024. SnapKV: LLM Knows What You are Looking for Before Generation. 
arXiv:2404.14469 [cs.CL] https:\/\/arxiv.org\/abs\/2404.14469"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651384"},{"key":"e_1_3_2_1_23_1","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat and others. 2024. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_2_1_24_1","first-page":"12","article-title":"PyTorch: an imperative style, high-performance deep learning library. Curran Associates Inc., Red Hook, NY, USA","volume":"721","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: an imperative style, high-performance deep learning library. Curran Associates Inc., Red Hook, NY, USA, Chapter 721, 12.","journal-title":"Chapter"},{"key":"e_1_3_2_1_25_1","unstructured":"Bowen Peng Jeffrey Quesnelle Honglu Fan and Enrico Shippole. 2023. YaRN: Efficient Context Window Extension of Large Language Models. arXiv:2309.00071 [cs.CL] https:\/\/arxiv.org\/abs\/2309.00071"},{"key":"e_1_3_2_1_26_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. https:\/\/openai.com\/blog\/language-unsupervised. OpenAI Blog."},{"key":"e_1_3_2_1_27_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. 
OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_28_1","volume-title":"Zhu","author":"Ren Siyu","year":"2024","unstructured":"Siyu Ren and Kenny Q. Zhu. 2024. On the Efficacy of Eviction Policy for Key-Value Constrained Generative Language Model Inference. arXiv:2402.06262 [cs.CL] https:\/\/arxiv.org\/abs\/2402.06262"},{"key":"e_1_3_2_1_29_1","unstructured":"Baptiste Rozi\u00e8re Jonas Gehring Fabian Gloeckle Sten Sootla Itai Gat Xiaoqing Ellen Tan Yossi Adi Jingyu Liu Romain Sauvestre Tal Remez J\u00e9r\u00e9my Rapin Artyom Kozhevnikov Ivan Evtimov Joanna Bitton Manish Bhatt Cristian Canton Ferrer Aaron Grattafiori Wenhan Xiong Alexandre D\u00e9fossez Jade Copet Faisal Azhar Hugo Touvron Louis Martin Nicolas Usunier Thomas Scialom and Gabriel Synnaeve. 2024. Code Llama: Open Foundation Models for Code. arXiv:2308.12950 [cs.CL] https:\/\/arxiv.org\/abs\/2308.12950"},{"key":"e_1_3_2_1_30_1","unstructured":"Jay Shah Ganesh Bikshandi Ying Zhang Vijay Thakkar Pradeep Ramani and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. arXiv:2407.08608 [cs.LG] https:\/\/arxiv.org\/abs\/2407.08608"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jbi.2020.103627"},{"key":"e_1_3_2_1_32_1","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Shi Yining","year":"2023","unstructured":"Yining Shi, Zhi Yang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Ziming Miao, Yuxiao Guo, Fan Yang, and Lidong Zhou. 2023. Welder: Scheduling Deep Learning Memory Access via Tile-graph. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 701--718. 
https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/shi"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523448"},{"key":"e_1_3_2_1_34_1","volume-title":"Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al. 2024. Gemma 2: Improving Open Language Models at a Practical Size. arXiv:2408.00118 [cs.CL] https:\/\/arxiv.org\/abs\/2408.00118"},{"key":"e_1_3_2_1_35_1","unstructured":"MosaicML NLP Team. 2023. Introducing MPT-30B: Raising the bar for open-source foundation models. MosaicML. www.mosaicml.com\/blog\/mpt-30b Accessed: 2023-06-22."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA TensorRT. 2017. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_38_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971 [cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_2_1_39_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. 
arXiv:2307.09288 [cs.CL] https:\/\/arxiv.org\/abs\/2307.09288"},{"key":"e_1_3_2_1_40_1","volume-title":"\u0141ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc., Long Beach, California, USA. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_41_1","volume-title":"PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Wang Haojie","year":"2021","unstructured":"Haojie Wang, Jidong Zhai, Mingyu Gao, Zixuan Ma, Shizhi Tang, Liyan Zheng, Yuanzhi Li, Kaiyuan Rong, Yuanyong Chen, and Zhihao Jia. 2021. PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, Virtual, 37--54. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/wang"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_43_1","unstructured":"TensorFlow XLA. 2023. 
https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.814"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Zhang Zhenyu","year":"2024","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, Zhangyang Wang, and Beidi Chen. 2024. H2O: heavy-hitter oracle for efficient generative inference of large language models. In Proceedings of the 37th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '23). Curran Associates Inc., Red Hook, NY, USA, Article 1506, 50 pages."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: generating high-performance tensor programs for deep learning. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20). USENIX Association, USA, Article 49, 17 pages."},{"key":"e_1_3_2_1_47_1","volume-title":"EINNET: Optimizing Tensor Programs with Derivation-Based Transformations. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Zheng Liyan","year":"2023","unstructured":"Liyan Zheng, Haojie Wang, Jidong Zhai, Muyan Hu, Zixuan Ma, Tuowei Wang, Shuhong Huang, Xupeng Miao, Shizhi Tang, Kezhao Huang, and Zhihao Jia. 2023. EINNET: Optimizing Tensor Programs with Derivation-Based Transformations. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 739--755. 
https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/zheng"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas NV USA","acronym":"PPoPP '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710864","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710864","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:15:04Z","timestamp":1755875704000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710864"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":48,"alternative-id":["10.1145\/3710848.3710864","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710864","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}