{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:57:26Z","timestamp":1776931046816,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFE0205700"],"award-info":[{"award-number":["2023YFE0205700"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62341410"],"award-info":[{"award-number":["62341410"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003009","name":"Science and Technology Development Fund","doi-asserted-by":"publisher","award":["0078\/2023\/AMJ, 001\/2024\/SKL"],"award-info":[{"award-number":["0078\/2023\/AMJ, 001\/2024\/SKL"]}],"id":[{"id":"10.13039\/501100003009","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759771","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1604-1618","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["HyTiS: Hybrid Tile Scheduling for GPU GEMM with Enhanced Wave Utilization and Cache Locality"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6599-9976","authenticated-orcid":false,"given":"Zheng","family":"Zhang","sequence":"first","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8329-0145","authenticated-orcid":false,"given":"Hulin","family":"Wang","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6255-263X","authenticated-orcid":false,"given":"Hongming","family":"Xu","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3913-3623","authenticated-orcid":false,"given":"Donglin","family":"Yang","sequence":"additional","affiliation":[{"name":"NVIDIA Corporation, Santa Clara, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9500-3390","authenticated-orcid":false,"given":"Xiaobo","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Macau, Macau, Macao"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2869-7623","authenticated-orcid":false,"given":"Dazhao","family":"Cheng","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00008"},{"key":"e_1_3_3_3_3_2","unstructured":"Hoque Adnan Wright Less Martin Antoni Vir\u00f3s and Yang Chih-Chieh. 2024. Accelerating MoE model inference with Locality-Aware Kernel Design. https:\/\/pytorch.org\/blog\/accelerating-moe-model"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173169"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","unstructured":"Jiri Brabec Jan Brandejs Karol Kowalski Sotiris Xantheas \u00d6rs Legeza and Libor Veis. 2021. Massively parallel quantum chemical density matrix renormalization group method. Journal of Computational Chemistry 42 8 (2021) 534\u2013544. 10.1002\/JCC.26476","DOI":"10.1002\/JCC.26476"},{"key":"e_1_3_3_3_7_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems (NeurIPS) (2020)."},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"crossref","unstructured":"Stefano Castruccio David\u00a0J McInerney Michael\u00a0L Stein Feifei Liu\u00a0Crouch Robert\u00a0L Jacob and Elisabeth\u00a0J Moyer. 2014. Statistical emulation of climate model projections based on precomputed GCM runs. Journal of Climate 27 5 (2014) 1829\u20131844.","DOI":"10.1175\/JCLI-D-13-00099.1"},{"key":"e_1_3_3_3_9_2","first-page":"578","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et\u00a0al. 2018. TVM: An automated End-to-End optimizing compiler for deep learning. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI). USENIX Association, Carlsbad, CA, USA, 578\u2013594."},{"key":"e_1_3_3_3_10_2","unstructured":"Tianqi Chen Lianmin Zheng Eddie Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. Learning to optimize tensor programs. Advances in Neural Information Processing Systems (NeurIPS) 31 (2018)."},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","unstructured":"Jack Choquette. 2023. Nvidia hopper h100 gpu: Scaling performance. IEEE Micro 43 3 (2023) 9\u201317. 10.1109\/MM.2023.3256796","DOI":"10.1109\/MM.2023.3256796"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220622"},{"key":"e_1_3_3_3_13_2","unstructured":"NVIDIA Corporation. [n. d.]. CUTLASS. Retrieved March 1 2025 from https:\/\/github.com\/NVIDIA\/cutlass"},{"key":"e_1_3_3_3_14_2","unstructured":"NVIDIA Corporation. 2024. cuBLAS: Basic Linear Algebra on NVIDIA GPUs. Retrieved March 1 2025 from https:\/\/developer.nvidia.com\/cublas"},{"key":"e_1_3_3_3_15_2","unstructured":"NVIDIA Corporation. 2024. NSight Compute Metrics Guide. Retrieved March 1 2025 from https:\/\/docs.nvidia.com\/nsight-compute\/ProfilingGuide\/index.html#metrics-guide"},{"key":"e_1_3_3_3_16_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA cuDNN. Retrieved March 1 2025 from https:\/\/developer.nvidia.com\/cudnn"},{"key":"e_1_3_3_3_17_2","volume-title":"9th International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations (ICLR). Austria."},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","unstructured":"Hajar Falahati Mohammad Sadrosadati Qiumin Xu Juan G\u00f3mez-Luna Banafsheh Saber\u00a0Latibari Hyeran Jeon Shaahin Hesaabi Hamid Sarbazi-Azad Onur Mutlu Murali Annavaram et\u00a0al. 2024. Cross-core data sharing for energy-efficient gpus. ACM Transactions on Architecture and Code Optimization (TACO) 21 3 (2024) 42:1\u201342:32. 10.1145\/3653019","DOI":"10.1145\/3653019"},{"key":"e_1_3_3_3_19_2","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et\u00a0al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.12948 (2025)."},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3655906"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"crossref","unstructured":"Wang Hulin Donglin Yang Yaqi Xia Zheng Zhang Qigang Wang Jianping Fan Xiaobo Zhou and Dazhao Cheng. 2024. Raptor-T: A Fused and Memory-Efficient Sparse Transformer for Long and Variable-Length Sequences. IEEE Trans. Comput. 73 (2024) 1852\u20131865.","DOI":"10.1109\/TC.2024.3389507"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414623"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO57630.2024.10444873"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3715996"},{"key":"e_1_3_3_3_27_2","first-page":"4171","volume-title":"Proceedings of NAACL-HLT","author":"Kenton Jacob Devlin Ming-Wei\u00a0Chang","year":"2019","unstructured":"Jacob Devlin Ming-Wei\u00a0Chang Kenton and Lee\u00a0Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171\u20134186."},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835937"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO53902.2022.9741270"},{"key":"e_1_3_3_3_30_2","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.11692 (2019)."},{"key":"e_1_3_3_3_31_2","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI). USENIX Association."},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577479"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","unstructured":"Sreepathi Pai Matthew\u00a0J Thazhuthaveetil and Ramaswamy Govindarajan. 2013. Improving GPGPU concurrency with elastic kernels. ACM SIGARCH Computer Architecture News 41 1 (2013) 407\u2013418. 10.1145\/2451116.2451160","DOI":"10.1145\/2451116.2451160"},{"key":"e_1_3_3_3_35_2","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et\u00a0al. 2018. Improving language understanding by generative pre-training."},{"key":"e_1_3_3_3_36_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 (2019) 9."},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304072"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_3_3_39_2","first-page":"24261","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Tolstikhin Ilya\u00a0O.","year":"2021","unstructured":"Ilya\u00a0O. Tolstikhin, Neil Houlsby, Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner, Daniel Keysers, Jakob Uszkoreit, Mario Lucic, and Alexey Dosovitskiy. 2021. MLP-Mixer: An all-MLP Architecture for Vision. In Advances in Neural Information Processing Systems (NeurIPS). 24261\u201324272."},{"key":"e_1_3_3_3_40_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","unstructured":"Devashree Tripathy Amirali Abdolrashidi Laxmi\u00a0Narayan Bhuyan Liang Zhou and Daniel Wong. 2021. Paver: Locality graph-based thread block scheduling for gpus. ACM Transactions on Architecture and Code Optimization (TACO) 18 3 (2021) 1\u201326. 10.1145\/3451164","DOI":"10.1145\/3451164"},{"key":"e_1_3_3_3_42_2","unstructured":"Triton-Lang. 2025. Triton Persistent GEMM Tutorial. Retrieved March 11 2025 from https:\/\/github.com\/triton-lang\/triton\/blob\/main\/python\/tutorials\/09-persistent-matmul.py"},{"key":"e_1_3_3_3_43_2","first-page":"5998","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems (NeurIPS). 5998\u20136008."},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751213"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00027"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607061"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640390"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","unstructured":"Zheng Zhang Yaqi Xia Hulin Wang Donglin Yang Chuang Hu Xiaobo Zhou and Dazhao Cheng. 2024. MPMoE: Memory Efficient MoE for Pre-Trained Models With Adaptive Pipeline Parallelism. IEEE Transactions on Parallel and Distributed Systems 35 6 (2024) 843\u2013856. 10.1109\/TPDS.2024.3385639","DOI":"10.1109\/TPDS.2024.3385639"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00026"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00040"},{"key":"e_1_3_3_3_51_2","first-page":"863","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody\u00a0Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI). USENIX Association, 863\u2013879."},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623792"},{"key":"e_1_3_3_3_53_2","unstructured":"Daquan Zhou Bingyi Kang Xiaojie Jin Linjie Yang Xiaochen Lian Qibin Hou and Jiashi Feng. 2021. DeepViT: Towards Deeper Vision Transformer. CoRR abs\/2103.11886 (2021)."},{"key":"e_1_3_3_3_54_2","first-page":"233","volume-title":"Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. 2022. ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In Proceedings of the USENIX Symposium on Operating Systems Design and Implementation (OSDI). USENIX Association, Carlsbad, CA, USA, 233\u2013248."}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759771","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:40:52Z","timestamp":1773254452000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759771"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":53,"alternative-id":["10.1145\/3712285.3759771","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759771","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}