{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T01:00:59Z","timestamp":1781053259370,"version":"3.54.1"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Future Generation Computer Systems"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.future.2026.108606","type":"journal-article","created":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T15:09:59Z","timestamp":1779548999000},"page":"108606","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Mapo: Performance model driven GPU memory access code optimization"],"prefix":"10.1016","volume":"184","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5398-7955","authenticated-orcid":false,"given":"Shuai","family":"Liu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoshe","family":"Dong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junkai","family":"Cao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ruifan","family":"Chu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5064-2376","authenticated-orcid":false,"given":"Ziheng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9179-6611","authenticated-orcid":false,"given":"Qiang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiuxiu","family":"Bai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"4","key":"10.1016\/j.future.2026.108606_b1","doi-asserted-by":"crossref","first-page":"4014","DOI":"10.1007\/s11227-020-03417-5","article-title":"Performance prediction of parallel applications: a systematic literature review","volume":"77","author":"Flores-Contreras","year":"2021","journal-title":"J. Supercomput."},{"key":"10.1016\/j.future.2026.108606_b2","doi-asserted-by":"crossref","unstructured":"X. Ding, L. Chen, M. Emani, C. Liao, P.-H. Lin, T. Vanderbruggen, Z. Xie, A. Cerpa, W. Du, Hpc-gpt: Integrating large language model for high-performance computing, in: Proceedings of the SC\u201923 Workshops of the International Conference on High Performance Computing, Network, Storage, and Analysis, 2023, pp. 951\u2013960.","DOI":"10.1145\/3624062.3624172"},{"issue":"1","key":"10.1016\/j.future.2026.108606_b3","doi-asserted-by":"crossref","first-page":"194","DOI":"10.1007\/s11227-024-06637-1","article-title":"chatHPC: Empowering HPC users with large language models","volume":"81","author":"Yin","year":"2025","journal-title":"J. Supercomput."},{"key":"10.1016\/j.future.2026.108606_b4","series-title":"The AI CUDA Engineer: Agentic CUDA Kernel Discovery, Optimization and Composition","author":"Lange","year":"2025"},{"issue":"1","key":"10.1016\/j.future.2026.108606_b5","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1109\/TPDS.2023.3325630","article-title":"Adaptive auto-tuning framework for global exploration of stencil optimization on gpus","volume":"35","author":"Sun","year":"2023","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/j.future.2026.108606_b6","doi-asserted-by":"crossref","unstructured":"Y. Arafa, A.-H. Badawy, A. ElWazir, A. Barai, A. Eker, G. Chennupati, N. Santhi, S. Eidenbenz, Hybrid, scalable, trace-driven performance modeling of GPGPUs, in: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, 2021, pp. 1\u201315.","DOI":"10.1145\/3458817.3476221"},{"key":"10.1016\/j.future.2026.108606_b7","doi-asserted-by":"crossref","unstructured":"T. Shu, Y. Guo, J. Wozniak, X. Ding, I. Foster, T. Kurc, Bootstrapping in-situ workflow auto-tuning via combining performance models of component applications, in: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, 2021, pp. 1\u201315.","DOI":"10.1145\/3458817.3476197"},{"issue":"3","key":"10.1016\/j.future.2026.108606_b8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3522712","article-title":"Performance and power prediction for concurrent execution on gpus","volume":"19","author":"Moolchandani","year":"2022","journal-title":"ACM Trans. Archit. Code Optim. (TACO)"},{"key":"10.1016\/j.future.2026.108606_b9","doi-asserted-by":"crossref","DOI":"10.1016\/j.jpdc.2024.104919","article-title":"HBPB, applying reuse distance to improve cache efficiency proactively","volume":"191","author":"Krause","year":"2024","journal-title":"J. Parallel Distrib. Comput."},{"issue":"1","key":"10.1016\/j.future.2026.108606_b10","first-page":"1","article-title":"ReuseTracker: fast yet accurate multicore reuse distance analyzer","volume":"19","author":"Sasongko","year":"2021","journal-title":"ACM Trans. Archit. Code Optim. (TACO)"},{"key":"10.1016\/j.future.2026.108606_b11","series-title":"2021 IEEE International Symposium on High-Performance Computer Architecture","first-page":"868","article-title":"Need for speed: Experiences building a trustworthy system-level gpu simulator","author":"Villa","year":"2021"},{"key":"10.1016\/j.future.2026.108606_b12","series-title":"2019 IEEE International Symposium on Performance Analysis of Systems and Software","first-page":"151","article-title":"Analyzing machine learning workloads using a detailed GPU simulator","author":"Lew","year":"2019"},{"key":"10.1016\/j.future.2026.108606_b13","series-title":"2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture","first-page":"473","article-title":"Accel-sim: An extensible simulation framework for validated gpu modeling","author":"Khairy","year":"2020"},{"key":"10.1016\/j.future.2026.108606_b14","doi-asserted-by":"crossref","unstructured":"S. Lee, A. Phanishayee, D. Mahajan, Forecasting GPU performance for deep learning training and inference, in: Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Vol. 1, 2025, pp. 493\u2013508.","DOI":"10.1145\/3669940.3707265"},{"issue":"1","key":"10.1016\/j.future.2026.108606_b15","first-page":"1","article-title":"A simple model for portable and fast prediction of execution time and power consumption of GPU kernels","volume":"18","author":"Braun","year":"2020","journal-title":"ACM Trans. Archit. Code Optim. (TACO)"},{"key":"10.1016\/j.future.2026.108606_b16","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1016\/j.jpdc.2022.09.002","article-title":"Evaluating execution time predictions on gpu kernels using an analytical model and machine learning techniques","volume":"171","author":"Amaris","year":"2023","journal-title":"J. Parallel Distrib. Comput."},{"key":"10.1016\/j.future.2026.108606_b17","doi-asserted-by":"crossref","unstructured":"A. Dutta, J. Alcaraz, A. TehraniJamsaz, E. Cesar, A. Sikora, A. Jannesari, Performance optimization using multimodal modeling and heterogeneous GNN, in: Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing, 2023, pp. 45\u201357.","DOI":"10.1145\/3588195.3592984"},{"key":"10.1016\/j.future.2026.108606_b18","doi-asserted-by":"crossref","unstructured":"A. Dutta, A. Jannesari, Mirencoder: Multi-modal ir-based pretrained embeddings for performance optimizations, in: Proceedings of the 2024 International Conference on Parallel Architectures and Compilation Techniques, 2024, pp. 156\u2013167.","DOI":"10.1145\/3656019.3676895"},{"key":"10.1016\/j.future.2026.108606_b19","series-title":"SC24: International Conference for High Performance Computing, Networking, Storage and Analysis","first-page":"1","article-title":"Learning generalizable program and architecture representations for performance modeling","author":"Li","year":"2024"},{"key":"10.1016\/j.future.2026.108606_b20","series-title":"Language models for code optimization: Survey, challenges and future directions","author":"Gong","year":"2025"},{"issue":"11","key":"10.1016\/j.future.2026.108606_b21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3570638","article-title":"Optimization techniques for GPU programming","volume":"55","author":"Hijma","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.future.2026.108606_b22","doi-asserted-by":"crossref","DOI":"10.1145\/3772721","article-title":"A survey on large language models for code generation","author":"Jiang","year":"2025","journal-title":"ACM Trans. Softw. Eng. Methodol."},{"key":"10.1016\/j.future.2026.108606_b23","doi-asserted-by":"crossref","unstructured":"D. Nichols, J.H. Davis, Z. Xie, A. Rajaram, A. Bhatele, Can large language models write parallel code?, in: Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing, 2024, pp. 281\u2013294.","DOI":"10.1145\/3625549.3658689"},{"key":"10.1016\/j.future.2026.108606_b24","series-title":"CUDA-LLM: LLMs can write efficient CUDA kernels","author":"Chen","year":"2025"},{"key":"10.1016\/j.future.2026.108606_b25","unstructured":"A. Ouyang, S. Guo, S. Arora, A.L. Zhang, W. Hu, C. R\u00e9, A. Mirhoseini, KernelBench: Can LLMs Write Efficient GPU Kernels?, in: ICML, 2025."},{"issue":"1","key":"10.1016\/j.future.2026.108606_b26","doi-asserted-by":"crossref","first-page":"72","DOI":"10.1109\/TPDS.2016.2549523","article-title":"Dissecting GPU memory hierarchy through microbenchmarking","volume":"28","author":"Mei","year":"2016","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/j.future.2026.108606_b27","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"8696","article-title":"CodeT5: Identifier-aware unified pre-trained Encoder-Decoder models for code understanding and generation","author":"Wang","year":"2021"},{"key":"10.1016\/j.future.2026.108606_b28","series-title":"Empirical Methods in Natural Language Processing","first-page":"1536","article-title":"CodeBERT: A pre-trained model for programming and natural languages","author":"Feng","year":"2020"},{"key":"10.1016\/j.future.2026.108606_b29","series-title":"Starcoder: may the source be with you!","author":"Li","year":"2023"},{"issue":"4","key":"10.1016\/j.future.2026.108606_b30","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3418463","article-title":"Ir2vec: Llvm ir based scalable program embeddings","volume":"17","author":"VenkataKeerthy","year":"2020","journal-title":"ACM Trans. Archit. Code Optim."},{"key":"10.1016\/j.future.2026.108606_b31","unstructured":"T.N. Kipf, M. Welling, Semi-Supervised Classification with Graph Convolutional Networks, in: 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017."},{"key":"10.1016\/j.future.2026.108606_b32","unstructured":"P. Veli\u010dkovi\u0107, G. Cucurull, A. Casanova, A. Romero, P. Li\u00f2, Y. Bengio, Graph Attention Networks, in: International Conference on Learning Representations, 2018."},{"key":"10.1016\/j.future.2026.108606_b33","unstructured":"Y. Li, D. Tarlow, M. Brockschmidt, R.S. Zemel, Gated Graph Sequence Neural Networks, in: 4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, May 2-4, 2016."},{"key":"10.1016\/j.future.2026.108606_b34","series-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","first-page":"6000","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.future.2026.108606_b35","series-title":"XGBoost: A Scalable Tree Boosting System","author":"Chen","year":"2016"},{"key":"10.1016\/j.future.2026.108606_b36","series-title":"Deepseekmath: Pushing the limits of mathematical reasoning in open language models","author":"Shao","year":"2024"},{"key":"10.1016\/j.future.2026.108606_b37","series-title":"Proximal policy optimization algorithms","author":"Schulman","year":"2017"},{"key":"10.1016\/j.future.2026.108606_b38","unstructured":"E.J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, W. Chen, LoRA: Low-Rank Adaptation of Large Language Models, in: The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022."},{"key":"10.1016\/j.future.2026.108606_b39","series-title":"2021 International Conference on Applied Artificial Intelligence","first-page":"1","article-title":"LS-CAT: a large-scale CUDA AutoTuning dataset","author":"Bjertnes","year":"2021"},{"key":"10.1016\/j.future.2026.108606_b40","series-title":"PolyBenchGPU","author":"Pouchet","year":"2012"},{"key":"10.1016\/j.future.2026.108606_b41","series-title":"RodiniaGPU","author":"CS","year":"2014"},{"key":"10.1016\/j.future.2026.108606_b42","series-title":"DeepBench","author":"Baiduresearch","year":"2016"},{"key":"10.1016\/j.future.2026.108606_b43","unstructured":"T. Mikolov, K. Chen, G. Corrado, J. Dean, Efficient Estimation of Word Representations in Vector Space, in: 1st International Conference on Learning Representations, ICLR 2013, Scottsdale, Arizona, USA, May 2-4, 2013, Workshop Track Proceedings, 2013."},{"issue":"4","key":"10.1016\/j.future.2026.108606_b44","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3291051","article-title":"Efficient cache performance modeling in GPUs using reuse distance analysis","volume":"15","author":"Kiani","year":"2018","journal-title":"ACM Trans. Archit. Code Optim. (TACO)"},{"key":"10.1016\/j.future.2026.108606_b45","series-title":"2019 IEEE 38th International Performance Computing and Communications Conference","first-page":"1","article-title":"GPUs cache performance estimation using reuse distance analysis","author":"Arafa","year":"2019"},{"issue":"2","key":"10.1016\/j.future.2026.108606_b46","doi-asserted-by":"crossref","first-page":"421","DOI":"10.31577\/cai_2019_2_421","article-title":"Rdgc: a reuse distance-based approach to GPU cache performance analysis","volume":"38","author":"Kiani","year":"2019","journal-title":"Comput. Inform."},{"key":"10.1016\/j.future.2026.108606_b47","doi-asserted-by":"crossref","unstructured":"Y. Arafa, A.-H. Badawy, G. Chennupati, A. Barai, N. Santhi, S. Eidenbenz, Fast, accurate, and scalable memory modeling of GPGPUs using reuse profiles, in: Proceedings of the 34th ACM International Conference on Supercomputing, 2020, pp. 1\u201312.","DOI":"10.1145\/3392717.3392761"},{"key":"10.1016\/j.future.2026.108606_b48","doi-asserted-by":"crossref","unstructured":"H. Abdelkhalik, Y. Arafa, N. Santhi, N. Prajapati, A.-H.A. Badawy, Modeling and Characterizing Shared and Local Memories of the Ampere GPUs, in: Proceedings of the International Symposium on Memory Systems, 2023, pp. 1\u20133.","DOI":"10.1145\/3631882.3631891"},{"key":"10.1016\/j.future.2026.108606_b49","doi-asserted-by":"crossref","unstructured":"A. Razzak, A. Barai, N. Santhi, A.H. Badawy, Static Reuse Profile Estimation for Array Applications, in: Proceedings of the International Symposium on Memory Systems, 2024, pp. 235\u2013244.","DOI":"10.1145\/3695794.3695817"},{"key":"10.1016\/j.future.2026.108606_b50","series-title":"Fast and Accurate Static Estimation of Reuse Distance Profiles for Array-Based Nested Loops Applications","author":"Razzak","year":"2025"},{"key":"10.1016\/j.future.2026.108606_b51","series-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning","author":"Guo","year":"2025"},{"key":"10.1016\/j.future.2026.108606_b52","series-title":"2022 IEEE 24th Int Conf on High Performance Computing & Communications; 8th Int Conf on Data Science & Systems; 20th Int Conf on Smart City; 8th Int Conf on Dependability in Sensor, Cloud & Big Data Systems & Application (HPCC\/DSS\/SmartCity\/DependSys)","first-page":"51","article-title":"MISA-MD: A new design of molecular dynamics software for GPU architecture","author":"Chu","year":"2022"},{"issue":"11","key":"10.1016\/j.future.2026.108606_b53","article-title":"Application verification of parallel computing method for large-scale CFD numerical simulation of turbomachinery","volume":"45","author":"Zhao","year":"2024","journal-title":"J. Propuls. Technol."}],"container-title":["Future Generation Computer Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167739X26002402?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167739X26002402?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T00:43:05Z","timestamp":1781052185000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167739X26002402"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":53,"alternative-id":["S0167739X26002402"],"URL":"https:\/\/doi.org\/10.1016\/j.future.2026.108606","relation":{},"ISSN":["0167-739X"],"issn-type":[{"value":"0167-739X","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Mapo: Performance model driven GPU memory access code optimization","name":"articletitle","label":"Article Title"},{"value":"Future Generation Computer Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.future.2026.108606","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108606"}}