{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:12:50Z","timestamp":1765465970427,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","funder":[{"name":"Early Researcher Award","award":["ER22-17-191"],"award-info":[{"award-number":["ER22-17-191"]}]},{"name":"Vector CIFAR AI Chair Award","award":["N&#x5c;&#x2f;A"],"award-info":[{"award-number":["N&#x5c;&#x2f;A"]}]},{"name":"VMware Early Career Faculty Grant&#x5c;&#x2f;Award","award":["N&#x5c;&#x2f;A"],"award-info":[{"award-number":["N&#x5c;&#x2f;A"]}]},{"name":"Google Scholar Research Award","award":["Holistic Systems Techniques for Efficient Training of Deep Learning Models"],"award-info":[{"award-number":["Holistic Systems Techniques for Efficient Training of Deep Learning Models"]}]},{"name":"NSERC Discovery Grant","award":["RGPIN-2024-06017"],"award-info":[{"award-number":["RGPIN-2024-06017"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3760250.3762219","type":"proceedings-article","created":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:06:36Z","timestamp":1765465596000},"page":"281-297","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Tilus: A Tile-Level GPGPU Programming Language for Low-Precision Computation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9801-6080","authenticated-orcid":false,"given":"Yaoyao","family":"Ding","sequence":"first","affiliation":[{"name":"Univerisity of Toronto, Toronto, ON, Canada, NVIDIA, Santa Clara, CA, USA, and Vector Institute, Toronto, ON, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5718-3387","authenticated-orcid":false,"given":"Bohan","family":"Hou","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0914-5669","authenticated-orcid":false,"given":"Xiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, ON, Canada and NVIDIA, Santa Clara, CA, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0401-1053","authenticated-orcid":false,"given":"Allan","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, ON, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5744-3940","authenticated-orcid":false,"given":"Tianqi","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA and NVIDIA, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9298-6254","authenticated-orcid":false,"given":"Cody Hao","family":"Yu","sequence":"additional","affiliation":[{"name":"Independent Researcher, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8165-840X","authenticated-orcid":false,"given":"Yida","family":"Wang","sequence":"additional","affiliation":[{"name":"Amazon Web Services, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3839-0919","authenticated-orcid":false,"given":"Gennady","family":"Pekhimenko","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, ON, Canada, NVIDIA, Santa Clara, CA, Canada, and Vector Institute, Toronto, ON, 
Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,12,11]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Wilson Wai Lun Fung, and Timothy G Rogers","author":"Aamodt Tor M","year":"2018","unstructured":"Tor M Aamodt, Wilson Wai Lun Fung, and Timothy G Rogers. 2018. The SIMT Core: Instruction and Register Data Flow. In General-Purpose Graphics Processor Architectures. Springer, 21-66."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"e_1_3_2_1_3_1","unstructured":"Aditya Agrawal Matthew Hedlund and Blake Hechtman. 2024. eXmY: A Data Type and Technique for Arbitrary Bit Precision Quantization. arXiv:2405.13938 [cs.LG] https:\/\/arxiv.org\/abs\/2405.13938"},{"key":"e_1_3_2_1_4_1","volume-title":"Chameleon: Adaptive Code Optimization for Expedited Deep Neural Network Compilation. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rygG4AVFvH","author":"Ahn Byung Hoon","year":"2020","unstructured":"Byung Hoon Ahn, Prannoy Pilligundla, Amir Yazdanbakhsh, and Hadi Esmaeilzadeh. 2020. Chameleon: Adaptive Code Optimization for Expedited Deep Neural Network Compilation. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rygG4AVFvH"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707284"},{"key":"e_1_3_2_1_6_1","unstructured":"AMD Corporation. 2024a. CDNA 3 Architecture for Accelerated Computing. Available at https:\/\/www.amd.com\/en\/technologies\/cdna.html."},{"key":"e_1_3_2_1_7_1","volume-title":"HIP: Heterogeneous-Compute Interface for Portability.","author":"AMD Corporation","year":"2024","unstructured":"AMD Corporation. 2024b. HIP: Heterogeneous-Compute Interface for Portability. Available at https:\/\/rocm.docs.amd.com\/projects\/HIP\/en\/latest\/."},{"key":"e_1_3_2_1_8_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=dfqsW38v1X","author":"Ashkboos Saleh","year":"2024","unstructured":"Saleh Ashkboos, Amirkeivan Mohtashami, Maximilian L. Croci, Bo Li, Pashmina Cameron, Martin Jaggi, Dan Alistarh, Torsten Hoefler, and James Hensman. 2024. QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=dfqsW38v1X"},{"key":"e_1_3_2_1_9_1","first-page":"1877","volume-title":"Lin (Eds.)","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 1877-1901. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_2_1_10_1","volume-title":"Lcq: Low-rank codebook based quantization for large language models. 
arXiv preprint arXiv:2405.20973","author":"Cai Wen-Pu","year":"2024","unstructured":"Wen-Pu Cai, Ming-Yang Li, and Wu-Jun Li. 2024. Lcq: Low-rank codebook based quantization for large language models. arXiv preprint arXiv:2405.20973 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Chee Jerry","year":"2023","unstructured":"Jerry Chee, Yaohui Cai, Volodymyr Kuleshov, and Christopher De Sa. 2023. QuIP: 2-bit quantization of large language models with guarantees. In Proceedings of the 37th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '23). Curran Associates Inc., Red Hook, NY, USA, Article 196, 34 pages."},{"key":"e_1_3_2_1_12_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In OSDI.","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Q. Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In OSDI."},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dettmers Tim","year":"2024","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2024. LLM.int8(): 8-bit matrix multiplication for transformers at scale. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 2198, 15 pages."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575702"},{"key":"e_1_3_2_1_15_1","first-page":"167","article-title":"Ios: Inter-operator scheduler for cnn acceleration","volume":"3","author":"Ding Yaoyao","year":"2021","unstructured":"Yaoyao Ding, Ligeng Zhu, Zhihao Jia, Gennady Pekhimenko, and Song Han. 2021. Ios: Inter-operator scheduler for cnn acceleration. Proceedings of Machine Learning and Systems, Vol. 3 (2021), 167-180.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_16_1","first-page":"38","article-title":"Cortex: A compiler for recursive deep learning models","volume":"3","author":"Fegade Pratik","year":"2021","unstructured":"Pratik Fegade, Tianqi Chen, Phillip Gibbons, and Todd Mowry. 2021. Cortex: A compiler for recursive deep learning models. Proceedings of Machine Learning and Systems, Vol. 3 (2021), 38-54.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Fegade Pratik","year":"2022","unstructured":"Pratik Fegade, Tianqi Chen, Phillip Gibbons, and Todd Mowry. 2022. The CoRa Tensor Compiler: Compilation for Ragged Tensors with Minimal Padding. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 721-747. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/d3d9446802a44259755d38e6d163e820-Paper.pdf"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476157"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"e_1_3_2_1_20_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. 
arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. arXiv:2408.11743 [cs.LG] https:\/\/arxiv.org\/abs\/2408.11743","author":"Frantar Elias","year":"2024","unstructured":"Elias Frantar, Roberto L. Castro, Jiale Chen, Torsten Hoefler, and Dan Alistarh. 2024. MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. arXiv:2408.11743 [cs.LG] https:\/\/arxiv.org\/abs\/2408.11743"},{"key":"e_1_3_2_1_22_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582018"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"e_1_3_2_1_25_1","volume-title":"Accelerating a Triton Fused Kernel for W4A16 Quantized Inference with SplitK work decomposition. arXiv preprint arXiv:2402.00025","author":"Hoque Adnan","year":"2024","unstructured":"Adnan Hoque, Less Wright, Chih-Chieh Yang, Mudhakar Srivatsa, and Raghu Ganti. 2024. Accelerating a Triton Fused Kernel for W4A16 Quantized Inference with SplitK work decomposition. arXiv preprint arXiv:2402.00025 (2024)."},{"key":"e_1_3_2_1_26_1","first-page":"680","article-title":"Alcop: Automatic load-compute pipelining in deep learning compiler for ai-gpus","volume":"5","author":"Huang Guyue","year":"2023","unstructured":"Guyue Huang, Yang Bai, Liu Liu, Yuke Wang, Bei Yu, Yufei Ding, and Yuan Xie. 2023. Alcop: Automatic load-compute pipelining in deep learning compiler for ai-gpus. Proceedings of Machine Learning and Systems, Vol. 5 (2023), 680-694.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/IEEESTD.2019.8766229"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_30_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. In MLSys.","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. In MLSys."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"key":"e_1_3_2_1_32_1","volume-title":"SpinQuant: LLM Quantization with Learned Rotations. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ogO6DGE6FZ","author":"Liu Zechun","year":"2025","unstructured":"Zechun Liu, Changsheng Zhao, Igor Fedorov, Bilge Soran, Dhruv Choudhary, Raghuraman Krishnamoorthi, Vikas Chandra, Yuandong Tian, and Tijmen Blankevoort. 2025. SpinQuant: LLM Quantization with Learned Rotations. In The Thirteenth International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=ogO6DGE6FZ"},{"key":"e_1_3_2_1_33_1","volume-title":"Dissecting the NVIDIA Hopper Architecture through Microbenchmarking and Multiple Level Analysis. arXiv preprint arXiv:2501.12084","author":"Luo Weile","year":"2025","unstructured":"Weile Luo, Ruibo Fan, Zeyu Li, Dayou Du, Hongyuan Liu, Qiang Wang, and Xiaowen Chu. 2025. Dissecting the NVIDIA Hopper Architecture through Microbenchmarking and Multiple Level Analysis. arXiv preprint arXiv:2501.12084 (2025)."},{"key":"e_1_3_2_1_34_1","volume-title":"RAMMER: Enabling Holistic Deep Learning Compiler Optimizations with Rtasks","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. RAMMER: Enabling Holistic Deep Learning Compiler Optimizations with Rtasks. USENIX Association, USA, 17."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658617.3697668"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00107"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2010.41"},{"key":"e_1_3_2_1_38_1","volume-title":"CUTLASS: CUDA Templates for Linear Algebra Subroutines and Solvers. https:\/\/github.com\/NVIDIA\/cutlass.","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. CUTLASS: CUDA Templates for Linear Algebra Subroutines and Solvers. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA Corporation. 2023. NVIDIA cuBLAS Library. https:\/\/developer.nvidia.com\/cublas Version 12.2.."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA Corporation. 2024a. CUDA C Programming Guide. Version 12.0. Available at https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA Corporation. 2024b. Parallel Thread Execution ISA Version 12.0. Available at https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html."},{"key":"e_1_3_2_1_42_1","volume-title":"SASS: Streaming Assembler for NVIDIA GPUs.","author":"NVIDIA Corporation","year":"2024","unstructured":"NVIDIA Corporation. 2024c. SASS: Streaming Assembler for NVIDIA GPUs. Available at https:\/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/index.html."},{"volume-title":"https:\/\/chat.openai.com\/. Accessed: 2024-11-12","author":"AI.","key":"e_1_3_2_1_43_1","unstructured":"OpenAI. 2024. ChatGPT. https:\/\/chat.openai.com\/. Accessed: 2024-11-12; Generative AI language model."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577479"},{"volume-title":"Different-Sized LLMs. In Proceedings of the 41st International Conference on Machine Learning.","author":"Park Yeonhong","key":"e_1_3_2_1_45_1","unstructured":"Yeonhong Park, Jake Hyun, SangLyul Cho, Bonggeun Sim, and Jae W. Lee. 2024. Any-Precision LLM: Low-Cost Deployment of Multiple, Different-Sized LLMs. In Proceedings of the 41st International Conference on Machine Learning."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_47_1","unstructured":"Bita Darvish Rouhani Ritchie Zhao Ankit More Mathew Hall Alireza Khodamoradi Summer Deng Dhruv Choudhary Marius Cornea Eric Dellinger Kristof Denolf et al. 2023. Microscaling data formats for deep learning. 
arXiv preprint arXiv:2310.10537 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"XLA : Compiling Machine Learning for Peak Performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne. 2020. XLA : Compiling Machine Learning for Peak Performance."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Shao Junru","year":"2024","unstructured":"Junru Shao, Xiyou Zhou, Siyuan Feng, Bohan Hou, Ruihang Lai, Hongyi Jin, Wuwei Lin, Masahiro Masuda, Cody Hao Yu, and Tianqi Chen. 2024. Tensor program optimization with probabilistic programs. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 2593, 14 pages."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.)","volume":"3","author":"Shen Haichen","year":"2021","unstructured":"Haichen Shen, Jared Roesch, Zhi Chen, Wei Chen, Yong Wu, Mu Li, Vin Sharma, Zachary Tatlock, and Yida Wang. 2021. Nimble: Efficiently Compiling Dynamic Neural Networks for Model Inference. In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.), Vol. 3. 208-222. https:\/\/proceedings.mlsys.org\/paper\/2021\/file\/4e732ced3463d06de0ca9a15b6153677-Paper.pdf"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523448"},{"key":"e_1_3_2_1_52_1","volume-title":"Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al., 2024. Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118 (2024)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Tseng Albert","year":"2024","unstructured":"Albert Tseng, Jerry Chee, Qingyao Sun, Volodymyr Kuleshov, and Christopher De Sa. 2024. QuIP#: even better LLM quantization with hadamard incoherence and lattice codebooks. In Proceedings of the 41st International Conference on Machine Learning (Vienna, Austria) (ICML'24). JMLR.org, Article 1987, 27 pages."},{"key":"e_1_3_2_1_55_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. ArXiv","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zach DeVito, William S. Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. ArXiv, Vol. abs\/1802.04730 (2018)."},{"key":"e_1_3_2_1_56_1","volume-title":"\u0141ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017a. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. 
Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_57_1","volume-title":"\u0141ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017b. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_58_1","first-page":"307","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Wang Lei","year":"2024","unstructured":"Lei Wang, Lingxiao Ma, Shijie Cao, Quanlu Zhang, Jilong Xue, Yining Shi, Ningxin Zheng, Ziming Miao, Fan Yang, Ting Cao, Yuqing Yang, and Mao Yang. 2024. Ladder: Enabling Efficient Low-Precision Deep Learning Computing through Hardware-aware Tensor Transformation. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 307-323. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/wang-lei"},{"volume-title":"Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, et al., 2022","year":"2022","key":"e_1_3_2_1_59_1","unstructured":"BigScience Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, et al., 2022. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100 (2022)."},{"key":"e_1_3_2_1_60_1","first-page":"699","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xia Haojun","year":"2024","unstructured":"Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, and Shuaiwen Leon Song. 2024. Quant-LLM: Accelerating the Serving of Large Language Models via FP6-Centric Algorithm-System Co-Design on Modern GPUs. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 699-713. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/xia"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of Machine Learning and Systems","volume":"4","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. 2022. Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance. In Proceedings of Machine Learning and Systems, Vol. 4."},{"key":"e_1_3_2_1_62_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et al. 2024. Qwen2 technical report. arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_63_1","first-page":"521","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. 
In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521-538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i21.34385"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Zhao Jie","year":"2022","unstructured":"Jie Zhao, Xiong Gao, Ruijie Xia, Zhaochuang Zhang, Deshi Chen, Lei Chen, Renwei Zhang, Zhen Geng, Bin Cheng, and Xuefeng Jin. 2022. Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 1-19. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/069059b7ef840f0c74a814ec9237b6ec-Paper.pdf"},{"key":"e_1_3_2_1_66_1","volume-title":"Haichen Shen, Joshua Fromm, Yizhi Liu, Yida Wang, Luis Ceze, Tianqi Chen, and Gennady Pekhimenko.","author":"Zheng Bojian","year":"2022","unstructured":"Bojian Zheng, Ziheng Jiang, Cody Hao Yu, Haichen Shen, Joshua Fromm, Yizhi Liu, Yida Wang, Luis Ceze, Tianqi Chen, and Gennady Pekhimenko. 2022b. DietCode: Automatic Optimization for Dynamic Tensor Programs. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 848-863. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/fa7cdfad1a5aaf8370ebeda47a1ff1c3-Paper.pdf"},{"key":"e_1_3_2_1_67_1","first-page":"863","volume-title":"Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020a. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 863-879."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_2_1_71_1","first-page":"233","volume-title":"ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. 2022. ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 233-248. 
https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zhu"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458838"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"],"location":"Pittsburgh PA USA"},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3760250.3762219","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:07:53Z","timestamp":1765465673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3760250.3762219"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,11]]},"references-count":72,"alternative-id":["10.1145\/3760250.3762219","10.1145\/3760250"],"URL":"https:\/\/doi.org\/10.1145\/3760250.3762219","relation":{},"subject":[],"published":{"date-parts":[[2025,12,11]]},"assertion":[{"value":"2025-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}