{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T08:52:49Z","timestamp":1771923169441,"version":"3.50.1"},"reference-count":57,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/cgo68049.2026.11395194","type":"proceedings-article","created":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T20:46:32Z","timestamp":1771879592000},"page":"630-643","source":"Crossref","is-referenced-by-count":0,"title":["Hexcute: A Compiler Framework for Automating Layout Synthesis in GPU Programs"],"prefix":"10.1109","author":[{"given":"Xiao","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Toronto,NVIDIA,Toronto,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yaoyao","family":"Ding","sequence":"additional","affiliation":[{"name":"University of Toronto,NVIDIA Vector Institute,Toronto,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bolin","family":"Sun","sequence":"additional","affiliation":[{"name":"University of Toronto,NVIDIA,Toronto,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yang","family":"Hu","sequence":"additional","affiliation":[{"name":"NVIDIA,Toronto,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tatiana","family":"Shpeisman","sequence":"additional","affiliation":[{"name":"NVIDIA,Toronto,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gennady","family":"Pekhimenko","sequence":"additional","affiliation":[{"name":"University of Toronto,NVIDIA Vector Institute,Toronto,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref4","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3663363"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2983149"},{"key":"ref7","article-title":"Recent advancements in end-to-end autonomous driving using deep learning: A survey","author":"Chib","year":"2023"},{"issue":"3","key":"ref8","first-page":"134","article-title":"Artificial intelligence and machine learning in drug discovery and development","volume-title":"Intelligent Medicine","volume":"2","author":"Patel","year":"2022"},{"key":"ref9","article-title":"Language models are few-shot learners","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Brown"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/PDCAT.2008.38"},{"key":"ref11","article-title":"Amd cdna\u2122 3 architecture: The all-new amd gpu architecture for the modern era of hpc and ai","year":"2025"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/hcs59251.2023.10254715"},{"key":"ref14","article-title":"Ascend-cc: Confidential computing on heterogeneous npu for emerging generative ai workloads","author":"Dhar","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2010.41"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/178243.178259"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3126535"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"ref19","article-title":"CUTLASS","author":"Thakkar","year":"2025"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582018"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575702"},{"key":"ref22","first-page":"87","article-title":"Awq: Activation-aware weight quantization for on-device llm compression and acceleration","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Lin"},{"key":"ref23","article-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2022"},{"key":"ref24","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","volume-title":"First Conference on Language Modeling","author":"Gu"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/2676726.2676992"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref27","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","volume-title":"International Conference on Learning Representations","author":"Shazeer"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3729262"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3760250.3762221"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3168805"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710871"},{"key":"ref32","article-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning","author":"Guo","year":"2025"},{"key":"ref33","article-title":"Hidet script"},{"key":"ref34","article-title":"CUTLASS - CuTe documentation","year":"2025"},{"key":"ref35","article-title":"Towards a high-performance ai compiler with upstream mlir","author":"Golin","year":"2024"},{"key":"ref36","article-title":"warp-level-matrix-instructions-ldmatrix","year":"2025"},{"key":"ref37","article-title":"Multiply-and-accumulate instruction: mma","year":"2025"},{"key":"ref38","article-title":"Flashattention: fast and memory-efficient exact attention with io-awareness","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dao"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/12276.13312"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/2555243.2555258"},{"key":"ref42","article-title":"NVDSL: Simplifying tensor cores with python-driven MLIR metaprogramming","volume-title":"Workshop on Efficient Systems for Foundation Models II @ ICML2024","author":"\u00d6zen"},{"key":"ref43","article-title":"cublas"},{"key":"ref44","article-title":"Flashinfer: Efficient and customizable attention engine for LLM inference serving","volume-title":"Eighth Conference on Machine Learning and Systems","author":"Ye"},{"key":"ref45","article-title":"Flashattention-3: fast and accurate attention with asynchrony and low-precision","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems","author":"Shah"},{"key":"ref46","article-title":"Mamba library","author":"Gu"},{"key":"ref47","volume-title":"CCCL: CUDA C++ Core Libraries","year":"2023"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.5555\/3291168.3291211"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582047"},{"key":"ref53","article-title":"Ansor: Generating high-performance tensor programs for deep learning","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","author":"Zheng"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"ref55","article-title":"Ladder: enabling efficient low-precision deep learning computing through hardware-aware tensor transformation","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Wang"},{"key":"ref56","article-title":"Thunderkittens: Simple, fast, and $\\textit{Adorable}$ kernels","volume-title":"in The Thirteenth International Conference on Learning Representations","author":"Spector"},{"key":"ref57","article-title":"Hexcute: Artifact for cgo 2026","author":"Zhang","year":"2025"}],"event":{"name":"2026 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11395173\/11394837\/11395194.pdf?arnumber=11395194","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T07:47:45Z","timestamp":1771919265000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11395194\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":57,"URL":"https:\/\/doi.org\/10.1109\/cgo68049.2026.11395194","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}