{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T07:50:39Z","timestamp":1765871439877,"version":"3.48.0"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,11,19]],"date-time":"2025-11-19T00:00:00Z","timestamp":1763510400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,19]],"date-time":"2025-11-19T00:00:00Z","timestamp":1763510400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172391,"],"award-info":[{"award-number":["62172391,"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62032023"],"award-info":[{"award-number":["62032023"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["T2125013"],"award-info":[{"award-number":["T2125013"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s42514-025-00240-3","type":"journal-article","created":{"date-parts":[[2025,11,19]],"date-time":"2025-11-19T13:09:41Z","timestamp":1763557781000},"page":"589-622","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hiperti: high performance system for cross-platform code generation of transformer model inference based on MLIR"],"prefix":"10.1007","volume":"7","author":[{"given":"Jiashu","family":"Yao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0457-4709","authenticated-orcid":false,"given":"Junmin","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Baokang","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Shilong","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Xi","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yunfei","family":"Pang","sequence":"additional","affiliation":[]},{"given":"Mingyi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yun","family":"Song","sequence":"additional","affiliation":[]},{"given":"Guangming","family":"Tan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,19]]},"reference":[{"key":"240_CR1","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00e9, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00e9gas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. Software available from tensorflow.org. https:\/\/www.tensorflow.org\/ (2015)"},{"key":"240_CR2","unstructured":"AMD: Next generation BLAS implementation for ROCm platform. https:\/\/github.com\/ROCm\/rocBLAS (2023)"},{"key":"240_CR3","unstructured":"Bai, J., Lu, F., Zhang, K., et al.: ONNX: Open Neural Network Exchange. GitHub (2019)"},{"key":"240_CR4","unstructured":"Bavoil, L.: Optimizing compute shaders for l2 locality using thread-group id swizzling. Retrieved August 25, 2021 (2020)"},{"key":"240_CR5","unstructured":"Chen, T., Moreau, T., Jiang, Z., Zheng, L., Yan, E., Shen, H., Cowan, M., Wang, L., Hu, Y., Ceze, L., et al.: $$\\{$$TVM$$\\}$$: An automated $$\\{$$End-to-End$$\\}$$ optimizing compiler for deep learning. In: 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pp. 578\u2013594 (2018)"},{"key":"240_CR6","unstructured":"Chen, T., Zheng, L., Yan, E., Jiang, Z., Moreau, T., Ceze, L., Guestrin, C., Krishnamurthy, A.: Learning to optimize tensor programs. Adv. Neural. Inf. Process. Syst. 31, (2018)"},{"key":"240_CR7","unstructured":"Cooperation, N.: Cublas introduction. https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html (2022)"},{"key":"240_CR8","unstructured":"Dao, T., Fu, D., Ermon, S., Rudra, A., R\u00e9, C.F.: Fast and memory-efficient exact attention with io-awareness. arXiv:2205.14135 (2022)"},{"key":"240_CR9","unstructured":"Dao, T.: Flashattention-2: faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)"},{"key":"240_CR10","first-page":"16344","volume":"35","author":"T Dao","year":"2022","unstructured":"Dao, T., Fu, D., Ermon, S., Rudra, A., R\u00e9, C.: Flashattention: fast and memory-efficient exact attention with io-awareness. Adv. Neural. Inf. Process. Syst. 35, 16344\u201316359 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"240_CR11","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"240_CR12","doi-asserted-by":"crossref","unstructured":"Ding, Y., Yu, C.H., Zheng, B., Liu, Y., Wang, Y., Pekhimenko, G.: Hidet: task-mapping programming paradigm for deep learning tensor programs. In: Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, vol. 2, pp. 370\u2013384 (2023)","DOI":"10.1145\/3575693.3575702"},{"key":"240_CR13","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/s42514-020-00039-4","volume":"2","author":"J Fang","year":"2020","unstructured":"Fang, J., Huang, C., Tang, T., Wang, Z.: Parallel programming models for heterogeneous many-cores: a comprehensive survey. CCF Trans. High Perform. Comput. 2, 382\u2013400 (2020)","journal-title":"CCF Trans. High Perform. Comput."},{"key":"240_CR14","doi-asserted-by":"crossref","unstructured":"Feng, S., Hou, B., Jin, H., Lin, W., Shao, J., Lai, R., Ye, Z., Zheng, L., Yu, C.H., Yu, Y., et al.: Tensorir: an abstraction for automatic tensorized program optimization. In: Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, vol. 2, pp. 804\u2013817 (2023)","DOI":"10.1145\/3575693.3576933"},{"key":"240_CR15","doi-asserted-by":"crossref","unstructured":"Ghorpade, J., Parande, J., Kulkarni, M., Bawaskar, A.: Gpgpu processing in cuda architecture. arXiv preprint arXiv:1202.4347 (2012)","DOI":"10.5121\/acij.2012.3109"},{"key":"240_CR16","doi-asserted-by":"crossref","unstructured":"Hagedorn, B., Fan, B., Chen, H., Cecka, C., Garland, M., Grover, V.: Graphene: an ir for optimized tensor computations on gpus. In: Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, vol. 3, pp. 302\u2013313 (2023)","DOI":"10.1145\/3582016.3582018"},{"key":"240_CR17","unstructured":"Kerr, A., Merrill, D., Demouth, J., Tran, J., Farooqui, N., Tavenrath, M., Schuster, V., Gornish, E., Zheng, J., Sathe, B.: Cutlass: cuda template library for dense linear algebra at all levels and scales. In: NVIDIA GPU Technology Conference (GTC) S8854 (Mar 2018) (2018)"},{"key":"240_CR18","doi-asserted-by":"crossref","unstructured":"Lattner, C., Adve, V.: Llvm: A compilation framework for lifelong program analysis and transformation. In: International Symposium on Code Generation and Optimization, 2004. CGO 2004, pp. 75\u201386 (2004). IEEE","DOI":"10.1109\/CGO.2004.1281665"},{"key":"240_CR19","doi-asserted-by":"publisher","unstructured":"Lattner, C., Amini, M., Bondhugula, U., Cohen, A., Davis, A., Pienaar, J., Riddle, R., Shpeisman, T., Vasilache, N., Zinenko, O.: MLIR: scaling compiler infrastructure for domain specific computation. In: 2021 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO), pp. 2\u201314 (2021). https:\/\/doi.org\/10.1109\/CGO51591.2021.9370308","DOI":"10.1109\/CGO51591.2021.9370308"},{"issue":"3","key":"240_CR20","doi-asserted-by":"publisher","first-page":"708","DOI":"10.1109\/TPDS.2020.3030548","volume":"32","author":"M Li","year":"2020","unstructured":"Li, M., Liu, Y., Liu, X., Sun, Q., You, X., Yang, H., Luan, Z., Gan, L., Yang, G., Qian, D.: The deep learning compiler: z comprehensive survey. IEEE Trans. Parallel Distrib. Syst. 32(3), 708\u2013727 (2020)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"240_CR21","unstructured":"Ma, L., Xie, Z., Yang, Z., Xue, J., Miao, Y., Cui, W., Hu, W., Yang, F., Zhang, L., Zhou, L.: Rammer: enabling holistic deep learning compiler optimizations with $$\\{$$rTasks$$\\}$$. In: 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pp. 881\u2013897 (2020)"},{"key":"240_CR22","unstructured":"Naveed, H., Khan, A.U., Qiu, S., Saqib, M., Anwar, S., Usman, M., Barnes, N., Mian, A.: A comprehensive overview of large language models. arXiv preprint arXiv:2307.06435 (2023)"},{"key":"240_CR23","doi-asserted-by":"crossref","unstructured":"Niu, W., Guan, J., Wang, Y., Agrawal, G., Ren, B.: Dnnfusion: accelerating deep neural networks execution with advanced operator fusion. In: Proceedings of the 42nd ACM SIGPLAN International Conference on Programming Language Design and Implementation, pp. 883\u2013898 (2021)","DOI":"10.1145\/3453483.3454083"},{"key":"240_CR24","unstructured":"NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf (2023)"},{"key":"240_CR25","unstructured":"NVIDIA: CUDA Toolkit Documentation. https:\/\/docs.nvidia.com\/cuda\/index.html (2024)"},{"key":"240_CR26","unstructured":"NVIDIA: The CUDA Basic Linear Algebra Subroutine library. https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html (2023)"},{"key":"240_CR27","unstructured":"OpenAI: Triton. https:\/\/github.com\/openai\/triton\/blob\/master\/lib\/codegen\/selection\/generator.cc (2022)"},{"key":"240_CR28","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. Adv. Neural. Inf. Process. Syst. 32, (2019)"},{"key":"240_CR29","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al.: Improving language understanding by generative pre-training (2018)"},{"issue":"6","key":"240_CR30","doi-asserted-by":"publisher","first-page":"519","DOI":"10.1145\/2499370.2462176","volume":"48","author":"J Ragan-Kelley","year":"2013","unstructured":"Ragan-Kelley, J., Barnes, C., Adams, A., Paris, S., Durand, F., Amarasinghe, S.: Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines. Acm Sigplan Not. 48(6), 519\u2013530 (2013)","journal-title":"Acm Sigplan Not."},{"issue":"31","key":"240_CR31","doi-asserted-by":"publisher","first-page":"23103","DOI":"10.1007\/s00521-023-08957-4","volume":"35","author":"T Talaei Khoei","year":"2023","unstructured":"Talaei Khoei, T., Ould Slimane, H., Kaabouch, N.: Deep learning: systematic review, models, challenges, and research directions. Neural Comput. Appl. 35(31), 23103\u201323124 (2023)","journal-title":"Neural Comput. Appl."},{"key":"240_CR32","unstructured":"Thakkar, V., Ramani, P., Cecka, C., Shivam, A., Lu, H., Yan, E., Kosaian, J., Hoemmen, M., Wu, H., Kerr, A., Nicely, M., Merrill, D., Blasig, D., Qiao, F., Majcher, P., Springer, P., Hohnerbach, M., Wang, J., Gupta, M.: CUTLASS. https:\/\/github.com\/NVIDIA\/cutlass (2023)"},{"key":"240_CR33","doi-asserted-by":"crossref","unstructured":"Tillet, P., Kung, H.-T., Cox, D.: Triton: an intermediate language and compiler for tiled neural network computations. In: Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages, pp. 10\u201319 (2019)","DOI":"10.1145\/3315508.3329973"},{"issue":"1","key":"240_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3477497","volume":"19","author":"A Ukarande","year":"2021","unstructured":"Ukarande, A., Patidar, S., Rangan, R.: Locality-aware cta scheduling for gaming applications. ACM Trans. Archit. Code Optim. 19(1), 1\u201326 (2021)","journal-title":"ACM Trans. Archit. Code Optim."},{"key":"240_CR35","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141, Polosukhin, I.: Attention is all you need. Adv. Neural. Inf. Process. Syst. 30 (2017)"},{"key":"240_CR36","doi-asserted-by":"crossref","unstructured":"Zhai, Y., Jiang, C., Wang, L., Jia, X., Zhang, S., Chen, Z., Liu, X., Zhu, Y.: Bytetransformer: a high-performance transformer boosted for variable-length inputs. In: 2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 344\u2013355 (2023). IEEE","DOI":"10.1109\/IPDPS54959.2023.00042"},{"key":"240_CR37","unstructured":"Zheng, L., Jia, C., Sun, M., Wu, Z., Yu, C.H., Haj-Ali, A., Wang, Y., Yang, J., Zhuo, D., Sen, K., et al.: Ansor: generating $$\\{$$High-Performance$$\\}$$ tensor programs for deep learning. In: 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pp. 863\u2013879 (2020)"},{"key":"240_CR38","doi-asserted-by":"crossref","unstructured":"Zheng, S., Liang, Y., Wang, S., Chen, R., Sheng, K.: Flextensor: An automatic schedule exploration and optimization framework for tensor computation on heterogeneous system. In: Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 859\u2013873 (2020)","DOI":"10.1145\/3373376.3378508"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00240-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-025-00240-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00240-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T07:47:25Z","timestamp":1765871245000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-025-00240-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":38,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["240"],"URL":"https:\/\/doi.org\/10.1007\/s42514-025-00240-3","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"type":"print","value":"2524-4922"},{"type":"electronic","value":"2524-4930"}],"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"30 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}