{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T19:21:48Z","timestamp":1777058508698,"version":"3.51.4"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,9,15]],"date-time":"2023-09-15T00:00:00Z","timestamp":1694736000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,15]],"date-time":"2023-09-15T00:00:00Z","timestamp":1694736000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Junshi","award":["62102389"],"award-info":[{"award-number":["62102389"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2024,6]]},"DOI":"10.1007\/s42514-023-00160-0","type":"journal-article","created":{"date-parts":[[2023,9,15]],"date-time":"2023-09-15T14:01:51Z","timestamp":1694786511000},"page":"343-364","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Uncovering the performance bottleneck of modern HPC processor with static code analyzer: a case study on Kunpeng 920"],"prefix":"10.1007","volume":"6","author":[{"given":"Shaojie","family":"Tan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9729-8821","authenticated-orcid":false,"given":"Qingcai","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenwei","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyu","family":"Hao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junshi","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hong","family":"An","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,9,15]]},"reference":[{"key":"160_CR1","unstructured":"Abadi, M., Barham, P., Chen, J., et\u00a0al.: Tensorflow: A system for large-scale machine learning. In: Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation. USENIX Association, USA, OSDI\u201916, p 265-283 (2016)"},{"key":"160_CR3","doi-asserted-by":"crossref","unstructured":"Abel, A., Reineke, J.: uops. info: characterizing latency, throughput, and port usage of instructions on intel microarchitectures. In: Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems, pp 673\u2013686 (2019)","DOI":"10.1145\/3297858.3304062"},{"key":"160_CR2","doi-asserted-by":"crossref","unstructured":"Abel, A., Reineke, J.: uica: accurate throughput prediction of basic blocks on recent intel microarchitectures. In: Proceedings of the 36th ACM International Conference on Supercomputing, pp 1\u201314 (2022)","DOI":"10.1145\/3524059.3532396"},{"key":"160_CR4","doi-asserted-by":"crossref","unstructured":"Alappat, C., Meyer, N., Laukemann, J., et\u00a0al.: Execution-cache-memory modeling and performance tuning of sparse matrix-vector multiplication and lattice quantum chromodynamics on a64fx. Concurrency and Computation: Practice and Experience p e6512 (2021)","DOI":"10.1002\/cpe.6512"},{"issue":"2","key":"160_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2024716.2024718","volume":"39","author":"N Binkert","year":"2011","unstructured":"Binkert, N., Beckmann, B., Black, G., et al.: The gem5 simulator. ACM SIGARCH Comput. Archit. News 39(2), 1\u20137 (2011)","journal-title":"ACM SIGARCH Comput. Archit. News"},{"key":"160_CR6","unstructured":"Bruening, D., Garnett, T., Amarasinghe, S.: An infrastructure for adaptive dynamic optimization. In: Proceedings of the International Symposium on Code Generation and Optimization: Feedback-Directed and Runtime Optimization. IEEE Computer Society, USA, CGO \u201903, p 265\u2013275 (2003)"},{"key":"160_CR7","doi-asserted-by":"crossref","unstructured":"Charif-Rubial, A.S., Oseret, E., Noudohouenou, J., et\u00a0al.: Cqa: a code quality analyzer tool at binary level. In: 2014 21st International Conference on High Performance Computing (HiPC), IEEE, pp 1\u201310 (2014)","DOI":"10.1109\/HiPC.2014.7116904"},{"key":"160_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Y., Brahmakshatriya, A., Mendis, C., et\u00a0al.: Bhive: a benchmark suite and measurement framework for validating x86-64 basic block performance models. In: 2019 IEEE International Symposium on Workload Characterization (IISWC), IEEE, pp 167\u2013177 (2019)","DOI":"10.1109\/IISWC47752.2019.9042166"},{"key":"160_CR9","volume-title":"Neural network architectures: an introduction","author":"JE Dayhoff","year":"1990","unstructured":"Dayhoff, J.E.: Neural network architectures: an introduction. Van Nostrand Reinhold Co (1990)"},{"issue":"1","key":"160_CR10","first-page":"1","volume":"5","author":"L Eeckhout","year":"2010","unstructured":"Eeckhout, L.: Computer architecture performance evaluation methods. Synth. Lect. Comput. Archit. 5(1), 1\u2013145 (2010)","journal-title":"Synth. Lect. Comput. Archit."},{"key":"160_CR11","unstructured":"Fog, A.: The microarchitecture of intel, amd, and via cpus. https:\/\/www.agner.org\/optimize\/microarchitecture.pdf (2022). Accessed 11 Sept 2023"},{"key":"160_CR12","unstructured":"Hammer, J., Hager, G., Wellein, G.: Ooo instruction benchmarking framework on the back of dragons. SC18 SRC Poster (in review) (2018)"},{"key":"160_CR13","unstructured":"HiSilicon.: Kunpeng 920 chipset. https:\/\/www.hisilicon.com\/en\/products\/Kunpeng\/Huawei-Kunpeng\/Huawei-Kunpeng-920 (2021). Accessed 11 Sept 2023"},{"key":"160_CR14","unstructured":"Hofmann, J.: ibench-instruction benchmarks (2017). URL https:\/\/githubcom\/RRZE-HPC\/ibench"},{"key":"160_CR15","unstructured":"Huawei.: Taishan v110 - microarchitectures - hisilicon. https:\/\/en.wikichip.org\/wiki\/hisilicon\/microarchitectures\/taishan_v110 (2022). Accessed 11 Sept 2023"},{"key":"160_CR16","doi-asserted-by":"crossref","unstructured":"Huh, J., Tuck, J.: Improving the effectiveness of searching for isomorphic chains in superword level parallelism. In: 2017 50th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO), IEEE, pp 718\u2013729 (2017)","DOI":"10.1145\/3123939.3124554"},{"key":"160_CR17","unstructured":"Intel architecture code analyzer user\u2019s guide. https:\/\/software.intel.com\/content\/dam\/develop\/external\/us\/en\/documents \/intel-architecture-code-analyzer-3-0-users-guide-157552.pdf (2017). Accessed 11 Sept 2023"},{"key":"160_CR25","unstructured":"llvm-mca - llvm machine code analyzer. https:\/\/llvm.org\/docs\/CommandGuide\/llvm-mca.html (2022). Accessed 11 Sept 2023"},{"key":"160_CR18","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Tan, S., Cao, Z., et\u00a0al.: Quantifying throughput of basic blocks on arm microarchitectures by static code analyzers: A case study on kunpeng 920. In: 2022 IEEE 24nd International Conference on High Performance Computing and Communications; IEEE 20th International Conference on Smart City; IEEE 8th International Conference on Data Science and Systems (HPCC\/SmartCity\/DSS), IEEE (2022)","DOI":"10.1109\/HPCC-DSS-SmartCity-DependSys57074.2022.00151"},{"issue":"1","key":"160_CR19","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1109\/LCA.2015.2414456","volume":"15","author":"Y Kim","year":"2016","unstructured":"Kim, Y., Yang, W., Mutlu, O.: Ramulator: a fast and extensible dram simulator. IEEE Comput. Archit. Lett. 15(1), 45\u201349 (2016). https:\/\/doi.org\/10.1109\/LCA.2015.2414456","journal-title":"IEEE Comput. Archit. Lett."},{"key":"160_CR20","unstructured":"Kronawitter, S., Lengauer, C.: Optimization of two jacobi smoother kernels by domain-specific program transformation. In: Proceedings of the 1st International Workshop on High-Performance Stencil Computations (HiStencils), pp 75\u201380 (2014)"},{"key":"160_CR21","unstructured":"Lattner, C., Adve, V.: Llvm: A compilation framework for lifelong program analysis & transformation. In: International Symposium on Code Generation and Optimization, 2004. CGO 2004., IEEE, pp 75\u201386 (2004a)"},{"key":"160_CR22","doi-asserted-by":"publisher","unstructured":"Lattner, C., Adve, V.: Llvm: a compilation framework for lifelong program analysis & transformation. In: International Symposium on Code Generation and Optimization, 2004. CGO 2004., pp 75\u201386, https:\/\/doi.org\/10.1109\/CGO.2004.1281665 (2004b)","DOI":"10.1109\/CGO.2004.1281665"},{"key":"160_CR23","doi-asserted-by":"crossref","unstructured":"Laukemann, J., Hammer, J., Hager, G., et al.: Automatic throughput and critical path analysis of x86 and arm assembly kernels. In: 2019 IEEE\/ACM Performance Modeling, pp. 1\u20136. Benchmarking and Simulation of High Performance Computer Systems (PMBS), IEEE (2019)","DOI":"10.1109\/PMBS49563.2019.00006"},{"key":"160_CR24","doi-asserted-by":"crossref","unstructured":"Laukemann, J., Hammer, J., Hofmann, J., et\u00a0al.: Automated instruction stream throughput prediction for intel and amd microarchitectures. In: 2018 IEEE\/ACM performance modeling, benchmarking and simulation of high performance computer systems (PMBS), IEEE, pp 121\u2013131 (2018)","DOI":"10.1109\/PMBS.2018.8641578"},{"key":"160_CR26","doi-asserted-by":"crossref","unstructured":"Lozano, R.C., Carlsson, M., Drejhammar, F., et\u00a0al.: Constraint-based register allocation and instruction scheduling. In: International Conference on Principles and Practice of Constraint Programming, Springer, pp 750\u2013766 (2012)","DOI":"10.1007\/978-3-642-33558-7_54"},{"key":"160_CR27","unstructured":"McGovern, A., Moss, J.: Scheduling straight-line code using reinforcement learning and rollouts. Advances in neural information processing Systems 11 (1998)"},{"key":"160_CR28","doi-asserted-by":"crossref","unstructured":"Mendis, C., Amarasinghe, S.: goslp: globally optimized superword level parallelism framework. Proceedings of the ACM on Programming Languages 2(OOPSLA):1\u201328 (2018)","DOI":"10.1145\/3276480"},{"key":"160_CR29","unstructured":"Mendis, C., Renda, A., Amarasinghe, S., et\u00a0al.: Ithemal: Accurate, portable and fast basic block throughput estimation using deep neural networks. In: International Conference on machine learning, PMLR, pp 4505\u20134515 (2019)"},{"key":"160_CR30","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1109\/2.982914","volume":"35","author":"S Mukherjee","year":"2002","unstructured":"Mukherjee, S., Adve, S., Austin, T., et al.: Performance simulation tools. Computer 35, 38\u201339 (2002). https:\/\/doi.org\/10.1109\/2.982914","journal-title":"Computer"},{"key":"160_CR31","doi-asserted-by":"crossref","unstructured":"Pohl, A., Cosenza, B., Juurlink, B.: Portable cost modeling for auto-vectorizers. In: 2019 IEEE 27th International Symposium on Modeling, Analysis, and Simulation of Computer and Telecommunication Systems (MASCOTS), IEEE, pp 359\u2013369 (2019)","DOI":"10.1109\/MASCOTS.2019.00046"},{"issue":"102","key":"160_CR32","first-page":"106","volume":"140","author":"A Pohl","year":"2020","unstructured":"Pohl, A., Cosenza, B., Juurlink, B.: Vectorization cost modeling for neon, avx and sve. Perform. Eval. 140(102), 106 (2020)","journal-title":"Perform. Eval."},{"key":"160_CR33","unstructured":"Reimplementation of the bhive profiler (2021). https:\/\/github.com\/gilbertmike\/bhive. Accessed 11 Sept 2023"},{"key":"160_CR34","doi-asserted-by":"crossref","unstructured":"Renda, A., Chen, Y., Mendis, C., et\u00a0al.: Difftune: Optimizing cpu simulator parameters with learned differentiable surrogates. In: 2020 53rd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO), IEEE, pp 442\u2013455 (2020)","DOI":"10.1109\/MICRO50266.2020.00045"},{"key":"160_CR35","doi-asserted-by":"crossref","unstructured":"Ritter, F., Hack, S.: Pmevo: portable inference of port mappings for out-of-order processors by evolutionary optimization. In: Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation, pp 608\u2013622 (2020)","DOI":"10.1145\/3385412.3385995"},{"issue":"1","key":"160_CR36","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1109\/L-CA.2011.4","volume":"10","author":"P Rosenfeld","year":"2011","unstructured":"Rosenfeld, P., Cooper-Balis, E., Jacob, B.: Dramsim2: a cycle accurate memory system simulator. IEEE Comput. Archit. Lett. 10(1), 16\u201319 (2011). https:\/\/doi.org\/10.1109\/L-CA.2011.4","journal-title":"IEEE Comput. Archit. Lett."},{"issue":"3","key":"160_CR37","doi-asserted-by":"publisher","first-page":"475","DOI":"10.1145\/2508148.2485963","volume":"41","author":"D Sanchez","year":"2013","unstructured":"Sanchez, D., Kozyrakis, C.: Zsim: fast and accurate microarchitectural simulation of thousand-core systems. ACM SIGARCH Comput. Architect. News 41(3), 475\u2013486 (2013)","journal-title":"ACM SIGARCH Comput. Architect. News"},{"key":"160_CR38","doi-asserted-by":"crossref","unstructured":"Seiferth, J., Alappat, C., Korch, M., et\u00a0al.: Applicability of the ecm performance model to explicit ode methods on current multi-core processors. In: International Conference on High Performance Computing, Springer, pp 163\u2013183 (2018)","DOI":"10.1007\/978-3-319-92040-5_9"},{"key":"160_CR39","doi-asserted-by":"crossref","unstructured":"Stengel, H., Treibig, J., Hager, G., et\u00a0al.: Quantifying performance bottlenecks of stencil computations using the execution-cache-memory model. In: Proceedings of the 29th ACM on International Conference on Supercomputing, pp 207\u2013216 (2015)","DOI":"10.1145\/2751205.2751240"},{"issue":"5","key":"160_CR40","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1145\/780822.781141","volume":"38","author":"M Stephenson","year":"2003","unstructured":"Stephenson, M., Amarasinghe, S., Martin, M., et al.: Meta optimization: improving compiler heuristics with machine learning. ACM SIGPLAN Not. 38(5), 77\u201390 (2003)","journal-title":"ACM SIGPLAN Not."},{"key":"160_CR41","doi-asserted-by":"crossref","unstructured":"Treibig, J., Hager, G.: Introducing a performance model for bandwidth-limited loop kernels. In: International Conference on Parallel Processing and Applied Mathematics, Springer, pp 615\u2013624 (2009)","DOI":"10.1007\/978-3-642-14390-8_64"},{"key":"160_CR42","doi-asserted-by":"publisher","DOI":"10.1145\/2601097.2601199","author":"I Wald","year":"2014","unstructured":"Wald, I., Woop, S., Benthin, C., et al.: Embree: a kernel framework for efficient CPU ray tracing. ACM Trans. Graph. (2014). https:\/\/doi.org\/10.1145\/2601097.2601199","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"160_CR43","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1145\/1498765.1498785","volume":"52","author":"S Williams","year":"2009","unstructured":"Williams, S., Waterman, A., Patterson, D.: Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52(4), 65\u201376 (2009)","journal-title":"Commun. ACM"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-023-00160-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-023-00160-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-023-00160-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,24]],"date-time":"2024-06-24T07:05:38Z","timestamp":1719212738000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-023-00160-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,15]]},"references-count":43,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2024,6]]}},"alternative-id":["160"],"URL":"https:\/\/doi.org\/10.1007\/s42514-023-00160-0","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,9,15]]},"assertion":[{"value":"30 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 June 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 September 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}