{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T07:38:36Z","timestamp":1770709116031,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,29]],"date-time":"2022-08-29T00:00:00Z","timestamp":1661731200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,29]]},"DOI":"10.1145\/3545008.3545022","type":"proceedings-article","created":{"date-parts":[[2023,1,15]],"date-time":"2023-01-15T01:04:08Z","timestamp":1673744648000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Characterizing and Optimizing Transformer Inference on ARM Many-core Processor"],"prefix":"10.1145","author":[{"given":"Jiazhi","family":"Jiang","sequence":"first","affiliation":[{"name":"Sun Yat-Sen University, China"}]},{"given":"Jiangsu","family":"Du","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, China"}]},{"given":"Dan","family":"Huang","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, China"}]},{"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, China"}]},{"given":"Jiang","family":"Zheng","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, China"}]},{"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"December. 15 2021. Effective Transformer. https:\/\/github.com\/bytedance\/effectivetransformer."},{"key":"e_1_3_2_1_2_1","unstructured":"December 15 2021. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3232162"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330355"},{"key":"e_1_3_2_1_5_1","volume-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274(2015).","author":"Chen Tianqi","year":"2015","unstructured":"Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. 2015. Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274(2015)."},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018).","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018)."},{"key":"e_1_3_2_1_7_1","unstructured":"Dave Dice and Alex Kogan. 2021. Optimizing Inference Performance of Transformers on CPUs. arXiv preprint arXiv:2102.06621(2021)."},{"key":"e_1_3_2_1_8_1","unstructured":"Jason\u00a0Li Emma\u00a0Ning Nathan\u00a0Yan. Published: 01-20-20 Accessed: 01-06-21.. Microsoft open sources breakthrough optimizations for transformer inference on gpu and cpu.https:\/\/cloudblogs.microsoft.com\/opensource\/2020\/01\/21\/microsoft-onnxopen- source-optimizations-transformer-inference-gpu-cpu\/. (Published: 01-20-20 Accessed: 01-06-21.)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-016-5588-7"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1377603.1377607"},{"key":"e_1_3_2_1_12_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942(2019).","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942(2019)."},{"key":"e_1_3_2_1_13_1","volume-title":"Optimizing Massively Parallel Winograd Convolution on ARM Processor. In 50th International Conference on Parallel Processing. 1\u201312","author":"Li Dongsheng","year":"2021","unstructured":"Dongsheng Li, Dan Huang, Zhiguang Chen, and Yutong Lu. 2021. Optimizing Massively Parallel Winograd Convolution on ARM Processor. In 50th International Conference on Parallel Processing. 1\u201312."},{"key":"e_1_3_2_1_14_1","volume-title":"Performance and energy consumption of HPC workloads on a cluster based on Arm ThunderX2 CPU. Future generation computer systems 112","author":"Mantovani Filippo","year":"2020","unstructured":"Filippo Mantovani, Marta Garcia-Gasulla, Jos\u00e9 Gracia, Esteban Stafford, Fabio Banchelli, Marc Josep-Fabrego, Joel Criado-Ledesma, and Mathias Nachtmann. 2020. Performance and energy consumption of HPC workloads on a cluster based on Arm ThunderX2 CPU. Future generation computer systems 112 (2020), 800\u2013818."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC43674.2020.9286150"},{"key":"e_1_3_2_1_16_1","unstructured":"Azita Nouri Philip\u00a0E Davis Pradeep Subedi and Manish Parashar. 2021. Exploring the Role of Machine Learning in Scientific Workflows: Opportunities and Challenges. arXiv preprint arXiv:2110.13999(2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019), 8026\u20138037."},{"key":"e_1_3_2_1_18_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433763"},{"key":"e_1_3_2_1_20_1","volume-title":"Accessed: 01-06-21.. Optimization for BERT Inference Performance on CPU. https:\/\/github.com\/NVIDIA\/FasterTransformer.","author":"Shufan\u00a0Wu Pengxin\u00a0Yuan","unstructured":"Pengxin\u00a0Yuan Shufan\u00a0Wu, Tao\u00a0Lv. Published: 09-12-19, Accessed: 01-06-21.. Optimization for BERT Inference Performance on CPU. https:\/\/github.com\/NVIDIA\/FasterTransformer. (Published: 09-12-19, Accessed: 01-06-21.)."},{"key":"e_1_3_2_1_21_1","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Thorpe John","year":"2021","unstructured":"John Thorpe, Yifan Qiao, Jonathan Eyolfson, Shen Teng, Guanzhou Hu, Zhihao Jia, Jinliang Wei, Keval Vora, Ravi Netravali, Miryung Kim, 2021. Dorylus: Affordable, Scalable, and Accurate {GNN} Training with Distributed {CPU} Servers and Serverless Threads. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). 495\u2013514."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476217"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics10161984"}],"event":{"name":"ICPP '22: 51st International Conference on Parallel Processing","location":"Bordeaux France","acronym":"ICPP '22"},"container-title":["Proceedings of the 51st International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3545008.3545022","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3545008.3545022","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:43Z","timestamp":1750186963000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3545008.3545022"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,29]]},"references-count":23,"alternative-id":["10.1145\/3545008.3545022","10.1145\/3545008"],"URL":"https:\/\/doi.org\/10.1145\/3545008.3545022","relation":{},"subject":[],"published":{"date-parts":[[2022,8,29]]},"assertion":[{"value":"2023-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}