{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T11:40:17Z","timestamp":1740397217564,"version":"3.37.3"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Science and Technology Major Project of China","award":["2023ZD0120502"],"award-info":[{"award-number":["2023ZD0120502"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372055"],"award-info":[{"award-number":["62372055"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s42514-024-00211-0","type":"journal-article","created":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T15:20:53Z","timestamp":1739373653000},"page":"17-28","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["SparkAttention: high-performance multi-head attention for large models on Volta GPU architecture"],"prefix":"10.1007","volume":"7","author":[{"given":"Youxuan","family":"Xu","sequence":"first","affiliation":[]},{"given":"Tong","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0022-7865","authenticated-orcid":false,"given":"Shigang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xueying","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jingjing","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,12]]},"reference":[{"key":"211_CR1","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: Longformer: the long-document transformer. arXiv preprint arXiv:2004.05150 (2020)"},{"key":"211_CR2","doi-asserted-by":"crossref","unstructured":"Bi, J., Zhu, Z., Meng, Q.: Transformer in computer vision. In: 2021 IEEE International conference on computer science, electronic information engineering and intelligent control technology (CEI). IEEE, pp. 178\u2013188 (2021)","DOI":"10.1109\/CEI52496.2021.9574462"},{"key":"211_CR3","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"211_CR4","unstructured":"Chen, T., Moreau, T., Jiang, Z., Zheng, L., Yan, E., Shen, H., Cowan, M., Wang, L., Hu, Y., Ceze, L., et\u00a0al.: TVM: An automated End-to-End optimizing compiler for deep learning. In: 13th USENIX symposium on operating systems design and implementation (OSDI 18), pp. 
578\u2013594 (2018)"},{"key":"211_CR5","unstructured":"Choromanski, K., Likhosherstov, V., Dohan, D., Song, X., Gane, A., Sarlos, T., Hawkins, P., Davis, J., Mohiuddin, A., Kaiser, L. et\u00a0al.: Rethinking attention with performers. arXiv preprint arXiv:2009.14794 (2020)"},{"key":"211_CR6","unstructured":"Clark, K., Luong, M.-T., Le, Q.V., Manning, C.D.: Electra: Pre-training text encoders as discriminators rather than generators. arXiv preprint arXiv:2003.10555 (2020)"},{"key":"211_CR7","unstructured":"NVIDIA Corporation: cuBLAS Library User Guide, version 12.0. [Online]. Available: https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html (2023)"},{"key":"211_CR8","unstructured":"Dao, T.: Flashattention-2: faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)"},{"key":"211_CR9","first-page":"16344","volume":"35","author":"T Dao","year":"2022","unstructured":"Dao, T., Fu, D., Ermon, S., Rudra, A., R\u00e9, C.: Flashattention: fast and memory-efficient exact attention with io-awareness. Adv. Neural. Inf. Process. Syst. 35, 16344\u201316359 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"211_CR10","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"211_CR11","doi-asserted-by":"crossref","unstructured":"Dong, L., Xu, S., Xu, B.: Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, pp. 5884\u20135888 (2018)","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"211_CR12","unstructured":"Fan, A., Grave, E., Joulin, A.: Reducing transformer depth on demand with structured dropout. arXiv preprint arXiv:1909.11556 (2019)"},{"key":"211_CR13","doi-asserted-by":"crossref","unstructured":"Fang, J., Yu, Y., Zhao, C., Zhou, J.: Turbotransformers: an efficient gpu serving system for transformer models. In: Proceedings of the 26th ACM SIGPLAN symposium on principles and practice of parallel programming, pp. 389\u2013402 (2021)","DOI":"10.1145\/3437801.3441578"},{"key":"211_CR14","unstructured":"Gong, L., He, D., Li, Z., Qin, T., Wang, L., Liu, T.: Efficient training of bert by progressively stacking. In: International conference on machine learning. PMLR, pp. 2337\u20132346 (2019)"},{"key":"211_CR15","doi-asserted-by":"crossref","unstructured":"Gulati, A., Qin, J., Chiu, C.-C., Parmar, N., Zhang, Y., Yu, J., Han, W., Wang, S., Zhang, Z., Wu, Y. et\u00a0al.: Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"211_CR16","unstructured":"Jakob, W., Rhinelander, J., Moldovan, D.: pybind11 - seamless operability between C++11 and Python. Accessed: 31 Oct 2024. Available: https:\/\/github.com\/pybind\/pybind11 (2017)"},{"key":"211_CR17","doi-asserted-by":"crossref","unstructured":"Kalyan, K.S., Rajasekharan, A., Sangeetha, S.: Ammus: A survey of transformer-based pretrained models in natural language processing. arXiv preprint arXiv:2108.05542 (2021)","DOI":"10.1016\/j.jbi.2021.103982"},{"key":"211_CR18","unstructured":"Kitaev, N., Kaiser, \u0141., Levskaya, A.: Reformer: The efficient transformer. 
arXiv preprint arXiv:2001.04451 (2020)"},{"key":"211_CR19","doi-asserted-by":"crossref","unstructured":"Kwon, W., Li, Z., Zhuang, S., Sheng, Y., Zheng, L., Yu, C.H., Gonzalez, J., Zhang, H., Stoica, I.: Efficient memory management for large language model serving with pagedattention. In: Proceedings of the 29th symposium on operating systems principles, pp. 611\u2013626 (2023)","DOI":"10.1145\/3600006.3613165"},{"key":"211_CR20","unstructured":"Lefaudeux, B., Massa, F., Liskovich, D., Xiong, W., Caggiano, V., Naren, S., Xu, M., Hu, J., Tintore, M., Zhang, S., Labatut, P., Haziza, D., Wehrstedt, L., Reizenstein, J., Sizov, G.: xformers: a modular and hackable transformer modelling library. https:\/\/github.com\/facebookresearch\/xformers (2022)"},{"key":"211_CR21","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, Z., Liu, H., Jiang, Y., Du, Q., Xiao, T., Wang, H., Zhu, J.: Shallow-to-deep training for neural machine translation. arXiv preprint arXiv:2010.03737 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.72"},{"key":"211_CR22","unstructured":"Liu, L., Jiang, H., He, P., Chen, W., Liu, X., Gao, J., Han, J.: On the variance of the adaptive learning rate and beyond. arXiv preprint arXiv:1908.03265 (2019)"},{"key":"211_CR23","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"211_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"211_CR25","unstructured":"Milakov, M., Gimelshein, N.: Online normalizer calculation for softmax. arXiv preprint arXiv:1805.02867 (2018)"},{"key":"211_CR26","unstructured":"NVIDIA: Apex. https:\/\/github.com\/NVIDIA\/apex (2020)"},{"key":"211_CR27","unstructured":"NVIDIA: Tensor Cores. https:\/\/www.nvidia.cn\/data-center\/tensor-cores\/ (2023)"},{"key":"211_CR28","unstructured":"NVIDIA: Warp-level matrix multiply-accumulate instructions. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html#matrix-shape (2023)"},{"key":"211_CR29","unstructured":"NVIDIA: PTX: Parallel thread execution. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html (2024)"},{"key":"211_CR30","unstructured":"NVIDIA-Ada: NVIDIA Ada GPU architecture tuning guide. https:\/\/docs.nvidia.com\/cuda\/ada-tuning-guide\/index.html (2023)"},{"key":"211_CR31","unstructured":"NVIDIA-Hopper: NVIDIA Hopper tuning guide. https:\/\/docs.nvidia.com\/cuda\/hopper-tuning-guide\/index.html (2023)"},{"key":"211_CR32","unstructured":"NVIDIA-Tuning: NVIDIA Turing GPU architecture tuning guide. https:\/\/docs.nvidia.com\/cuda\/turing-tuning-guide\/index.html (2023a)"},{"key":"211_CR33","unstructured":"NVIDIA-Tuning: NVIDIA Ampere GPU architecture tuning guide. https:\/\/docs.nvidia.com\/cuda\/ampere-tuning-guide\/index.html (2023b)"},{"key":"211_CR34","unstructured":"NVIDIA-Tuning: NVIDIA Volta GPU architecture tuning guide. https:\/\/docs.nvidia.com\/cuda\/volta-tuning-guide\/index.html (2023c)"},{"key":"211_CR35","unstructured":"OpenAI: GPT-4 technical report. Accessed: 2023-10-31. [Online]. 
Available: https:\/\/openai.com\/research\/gpt-4 (2023)"},{"key":"211_CR36","unstructured":"Patterson, D., Gonzalez, J., Le, Q., Liang, C., Munguia, L.-M., Rothchild, D., So, D., Texier, M., Dean, J.: Carbon emissions and large neural network training. arXiv preprint arXiv:2104.10350 (2021)"},{"key":"211_CR37","unstructured":"How much longer can computing power drive artificial intelligence progress? AI and compute"},{"key":"211_CR38","unstructured":"PyTorch: PyTorch framework. https:\/\/pytorch.org\/docs\/stable\/index.html (2023)"},{"key":"211_CR39","unstructured":"Rabe, M.N., Staats, C.: Self-attention does not need $$O(n^2)$$ memory. arXiv preprint arXiv:2112.05682 (2021)"},{"issue":"8","key":"211_CR40","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"issue":"140","key":"211_CR41","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P.J.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"211_CR42","doi-asserted-by":"crossref","unstructured":"Rasley, J., Rajbhandari, S., Ruwase, O., He, Y.: Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery & data mining, pp. 3505\u20133506 (2020)","DOI":"10.1145\/3394486.3406703"},{"issue":"09","key":"211_CR43","first-page":"13693","volume":"34","author":"E Strubell","year":"2020","unstructured":"Strubell, E., Ganesh, A., McCallum, A.: Energy and policy considerations for modern deep learning research. Proc. AAAI Conf. Artif. Intell. 34(09), 13693\u201313696 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"211_CR44","unstructured":"Sun, X., Choi, J., Chen, C.-Y., Wang, N., Venkataramani, S., Srinivasan, V.V., Cui, X., Zhang, W., Gopalakrishnan, K.: Hybrid 8-bit floating point (hfp8) training and inference for deep neural networks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"211_CR45","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"211_CR46","doi-asserted-by":"crossref","unstructured":"Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M. et\u00a0al.: Huggingface\u2019s transformers: State-of-the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"211_CR47","doi-asserted-by":"crossref","unstructured":"Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., Davison, J., Shleifer, S., von Platen, P., Ma, C., Jernite, Y., Plu, J., Xu, C., Scao, T.L., Gugger, S., Drame, M., Lhoest, Q., Rush, A.M.: Transformers: State-of-the-art natural language processing. In: Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations. Online: Association for Computational Linguistics, pp. 38\u201345. [Online]. 
Available: https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6 (2020)","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"211_CR48","doi-asserted-by":"crossref","unstructured":"Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M. et\u00a0al.: Transformers: State-of-the-art natural language processing. In: Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations, pp. 38\u201345 (2020)","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"211_CR49","unstructured":"Wu, B., Xu, C., Dai, X., Wan, A., Zhang, P., Yan, Z., Tomizuka, M., Gonzalez, J., Keutzer, K., Vajda, P.: Visual transformers: Token-based image representation and processing for computer vision. arXiv preprint arXiv:2006.03677 (2020)"},{"key":"211_CR50","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: Xlnet: Generalized autoregressive pretraining for language understanding. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"211_CR51","doi-asserted-by":"crossref","unstructured":"Yao, Z., Gholami, A., Shen, S., Mustafa, M., Keutzer, K., Mahoney, M.: Adahessian: An adaptive second order optimizer for machine learning. In: Proceedings of the AAAI conference on artificial intelligence, vol.\u00a035, no.\u00a012, pp. 10665\u201310673 (2021)","DOI":"10.1609\/aaai.v35i12.17275"},{"key":"211_CR52","unstructured":"Ying, C., Ke, G., He, D., Liu, T.-Y.: Lazyformer: self attention with lazy update. arXiv preprint arXiv:2102.12702 (2021)"},{"key":"211_CR53","unstructured":"Yu, G.-I., Jeong, J.S., Kim, G.-W., Kim, S., Chun, B.-G.: Orca: A distributed serving system for Transformer-based generative models. In: 16th USENIX symposium on operating systems design and implementation (OSDI 22), pp. 521\u2013538 (2022)"},{"key":"211_CR54","first-page":"17283","volume":"33","author":"M Zaheer","year":"2020","unstructured":"Zaheer, M., Guruganesh, G., Dubey, K.A., Ainslie, J., Alberti, C., Ontanon, S., Pham, P., Ravula, A., Wang, Q., Yang, L., et al.: Big bird: transformers for longer sequences. Adv. Neural. Inf. Process. Syst. 33, 17283\u201317297 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"211_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, X., Liu, S., Zhang, R., Liu, C., Huang, D., Zhou, S., Guo, J., Guo, Q., Du, Z., Zhi, T. et\u00a0al.: Fixed-point back-propagation training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2330\u20132338 (2020)","DOI":"10.1109\/CVPR42600.2020.00240"},{"key":"211_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Lu, H., Sak, H., Tripathi, A., McDermott, E., Koo, S., Kumar, S.: Transformer transducer: A streamable speech recognition model with transformer encoders and rnn-t loss. In: ICASSP 2020\u20132020 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, pp. 7829\u20137833 (2020)","DOI":"10.1109\/ICASSP40776.2020.9053896"},{"key":"211_CR57","first-page":"14011","volume":"33","author":"M Zhang","year":"2020","unstructured":"Zhang, M., He, Y.: Accelerating training of transformer-based language models with progressive layer dropping. Adv. Neural. Inf. Process. Syst. 33, 14011\u201314023 (2020)","journal-title":"Adv. Neural. Inf. Process. 
Syst."}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-024-00211-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-024-00211-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-024-00211-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T10:51:40Z","timestamp":1740394300000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-024-00211-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2]]},"references-count":57,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["211"],"URL":"https:\/\/doi.org\/10.1007\/s42514-024-00211-0","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"type":"print","value":"2524-4922"},{"type":"electronic","value":"2524-4930"}],"subject":[],"published":{"date-parts":[[2025,2]]},"assertion":[{"value":"16 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 December 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}