{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,5,21]],"date-time":"2024-05-21T00:23:01Z","timestamp":1716250981776},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,2,26]],"date-time":"2024-02-26T00:00:00Z","timestamp":1708905600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,26]],"date-time":"2024-02-26T00:00:00Z","timestamp":1708905600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Computing"],"published-print":{"date-parts":[[2024,5]]},"DOI":"10.1007\/s00607-023-01255-w","type":"journal-article","created":{"date-parts":[[2024,2,26]],"date-time":"2024-02-26T07:02:11Z","timestamp":1708930931000},"page":"1519-1555","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Many-BSP: an analytical performance model for CUDA kernels"],"prefix":"10.1007","volume":"106","author":[{"given":"Ali","family":"Riahi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdorreza","family":"Savadi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mahmoud","family":"Naghibzadeh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,2,26]]},"reference":[{"key":"1255_CR1","doi-asserted-by":"crossref","unstructured":"Hu, Z., Guangming, L., Wenrui, D.: A throughput-aware analytical performance model for GPU applications. 
In: Advanced Computer Architecture: 10th Annual Conference, ACA 2014, Shenyang, China, August 23-24, 2014. Proceedings. Springer Berlin Heidelberg. (2014) https:\/\/doi.org\/10.1007\/978-3-662-44491-7_8","DOI":"10.1007\/978-3-662-44491-7_8"},{"issue":"1","key":"1255_CR2","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1016\/j.jcss.2010.06.012","volume":"77","author":"LG Valiant","year":"2011","unstructured":"Valiant LG (2011) A bridging model for multi-core computing. J. Comput. Syst. Sci. 77(1):154\u2013166. https:\/\/doi.org\/10.1016\/j.jcss.2010.06.012","journal-title":"J. Comput. Syst. Sci."},{"issue":"2","key":"1255_CR3","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/MM.2011.24","volume":"31","author":"CM Wittenbrink","year":"2011","unstructured":"Wittenbrink CM, Emmett K, Arjun P (2011) Fermi GF100 GPU architecture. IEEE Micro 31(2):50\u201359. https:\/\/doi.org\/10.1109\/MM.2011.24","journal-title":"IEEE Micro"},{"key":"1255_CR4","unstructured":"NVIDIA: Whitepaper NVIDIA GeForce GTX 680, NVIDIA Corp (2012)"},{"key":"1255_CR5","unstructured":"NVIDIA: Whitepaper NVIDIA GeForce GTX 980, NVIDIA Corp (2014)"},{"key":"1255_CR6","unstructured":"NVIDIA: Whitepaper NVIDIA GeForce GTX 1080, NVIDIA Corp (2016)"},{"key":"1255_CR7","unstructured":"NVIDIA: Whitepaper NVIDIA TESLA V100 GPU ARCHITECTURE, NVIDIA Corp (2017)"},{"issue":"11","key":"1255_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3570638","volume":"55","author":"P Hijma","year":"2023","unstructured":"Hijma P et al (2023) Optimization techniques for GPU programming. ACM Comput. Surv. 55(11):1\u201381. https:\/\/doi.org\/10.1145\/3570638","journal-title":"ACM Comput. Surv."},{"issue":"11","key":"1255_CR9","doi-asserted-by":"publisher","first-page":"3142","DOI":"10.1109\/TPDS.2017.2704080","volume":"28","author":"MK Yoon","year":"2017","unstructured":"Yoon MK et al (2017) Dynamic resizing on active warps scheduler to hide operation stalls on GPUs. IEEE Trans. 
Parallel Distrib. Syst. 28(11):3142\u20133156. https:\/\/doi.org\/10.1109\/TPDS.2017.2704080","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"1255_CR10","doi-asserted-by":"crossref","unstructured":"Li, A., et al.: Warp-consolidation: a novel execution model for GPUs. In: Proceedings of the 2018 International Conference on Supercomputing. (2018) https:\/\/doi.org\/10.1145\/3205289.3205294","DOI":"10.1145\/3205289.3205294"},{"issue":"1","key":"1255_CR11","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/j.jpdc.2012.04.003","volume":"73","author":"AR Brodtkorb","year":"2013","unstructured":"Brodtkorb AR, Hagen TR, S\u00e6tra ML (2013) Graphics processing unit (GPU) programming strategies and trends in GPU computing. J. Parallel Distrib. Comput. 73(1):4\u201313. https:\/\/doi.org\/10.1016\/j.jpdc.2012.04.003","journal-title":"J. Parallel Distrib. Comput."},{"key":"1255_CR12","doi-asserted-by":"crossref","unstructured":"Matela, J., Martin, \u0160., Petr, H.: Low GPU occupancy approach to fast arithmetic coding in JPEG2000. In: International Doctoral Workshop on Mathematical and Engineering Methods in Computer Science. Springer Berlin Heidelberg. (2011) https:\/\/doi.org\/10.1007\/978-3-642-25929-6_13","DOI":"10.1007\/978-3-642-25929-6_13"},{"key":"1255_CR13","volume-title":"Understanding Latency Hiding on GPUs","author":"V Volkov","year":"2016","unstructured":"Volkov V (2016) Understanding Latency Hiding on GPUs. University of California, Berkeley"},{"key":"1255_CR14","doi-asserted-by":"crossref","unstructured":"Yu, Y., et al.: A stall-aware warp scheduling for dynamically optimizing thread-level parallelism in GPGPUs. In: Proceedings of the 29th ACM on International Conference on Supercomputing. 
(2015) https:\/\/doi.org\/10.1145\/2751205.2751234","DOI":"10.1145\/2751205.2751234"},{"key":"1255_CR15","unstructured":"NVIDIA: CUDA C Programming Guide, Version 10.1, NVIDIA Corp (2019)"},{"issue":"3","key":"1255_CR16","doi-asserted-by":"publisher","first-page":"332","DOI":"10.1145\/2508148.2485951","volume":"41","author":"A Jog","year":"2013","unstructured":"Jog A et al (2013) Orchestrated scheduling and prefetching for GPGPUs. Comput. Architect. News 41(3):332\u2013343. https:\/\/doi.org\/10.1145\/2508148.2485951","journal-title":"Comput. Architect. News"},{"key":"1255_CR17","doi-asserted-by":"crossref","unstructured":"Aguilera, P., et al.: Process variation-aware workload partitioning algorithms for GPUs supporting spatial-multitasking. In: 2014 Design, Automation & Test in Europe Conference & Exhibition (DATE). IEEE. (2014) https:\/\/doi.org\/10.7873\/DATE.2014.189","DOI":"10.7873\/DATE.2014.189"},{"issue":"4","key":"1255_CR18","doi-asserted-by":"publisher","first-page":"395","DOI":"10.1145\/2499368.2451158","volume":"48","author":"A Jog","year":"2013","unstructured":"Jog A et al (2013) OWL: cooperative thread array aware scheduling techniques for improving GPGPU performance. ACM SIGPLAN Not. 48(4):395\u2013406. https:\/\/doi.org\/10.1145\/2499368.2451158","journal-title":"ACM SIGPLAN Not."},{"key":"1255_CR19","unstructured":"NVIDIA: CUDA C Programming Guide, Version 7.0, NVIDIA Corporation (2015)"},{"issue":"2","key":"1255_CR20","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1109\/MM.2010.41","volume":"30","author":"J Nickolls","year":"2010","unstructured":"Nickolls J, Dally WJ (2010) The GPU computing era. IEEE micro 30(2):56\u201369. 
https:\/\/doi.org\/10.1109\/MM.2010.41","journal-title":"IEEE micro"},{"key":"1255_CR21","unstructured":"Singhania, N.: Static Analysis for GPU Program Performance, University of Pennsylvania (2018)"},{"issue":"1","key":"1255_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3177964","volume":"15","author":"Z Lin","year":"2018","unstructured":"Lin Z, Mantor M, Zhou H (2018) GPU performance vs. thread-level parallelism: scalability analysis and a novel way to improve TLP. Trans. Archit. Code Optim. (TACO) 15(1):1\u201321. https:\/\/doi.org\/10.1145\/3177964","journal-title":"Trans. Archit. Code Optim. (TACO)"},{"issue":"2","key":"1255_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2166879.2166882","volume":"30","author":"M Gebhart","year":"2012","unstructured":"Gebhart M et al (2012) A hierarchical thread scheduler and register file for energy-efficient throughput processors. ACM Trans. Comput. Syst. 30(2):1\u201338. https:\/\/doi.org\/10.1145\/2166879.2166882","journal-title":"ACM Trans. Comput. Syst."},{"key":"1255_CR24","unstructured":"Koike, A.: A Computational Model and Algorithms to Utilize GPUs for Discrete Problems. University of Sokendai (2015)"},{"key":"1255_CR25","doi-asserted-by":"crossref","unstructured":"Nemirovsky, M., Tullsen, D. M.: Multithreading architecture. In: Springer Cham, Synth Lect Comput Archit, 1st edn, pp 1-109. (2013) https:\/\/doi.org\/10.2200\/S00458ED1V01Y201212CAC021","DOI":"10.2200\/S00458ED1V01Y201212CAC021"},{"issue":"8","key":"1255_CR26","doi-asserted-by":"publisher","first-page":"1560","DOI":"10.1109\/TCAD.2017.2764886","volume":"37","author":"Y Liang","year":"2017","unstructured":"Liang Y et al (2017) Optimizing cache bypassing and warp scheduling for GPUs. IEEE Trans. Comput.-Aided Des. Integr. Circuits Syst. 37(8):1560\u20131573. https:\/\/doi.org\/10.1109\/TCAD.2017.2764886","journal-title":"IEEE Trans. Comput.-Aided Des. Integr. 
Circuits Syst."},{"key":"1255_CR27","doi-asserted-by":"crossref","unstructured":"Narasiman, V., et al.: Improving GPU performance via large warps and two-level warp scheduling. In: Proceedings of the 44th Annual IEEE\/ACM International Symposium on Microarchitecture. (2011) https:\/\/doi.org\/10.1145\/2155620.2155656","DOI":"10.1145\/2155620.2155656"},{"key":"1255_CR28","doi-asserted-by":"crossref","unstructured":"Hagiescu, A., et al.: Automated architecture-aware mapping of streaming applications onto GPUs. In: IEEE International Parallel & Distributed Processing Symposium, IEEE. (2011) https:\/\/doi.org\/10.1109\/IPDPS.2011.52","DOI":"10.1109\/IPDPS.2011.52"},{"key":"1255_CR29","doi-asserted-by":"crossref","unstructured":"Awatramani, M., et al.: Phase aware warp scheduling: mitigating effects of phase behavior in gpgpu applications. In: International Conference on Parallel Architecture and Compilation (PACT), IEEE. (2015) https:\/\/doi.org\/10.1109\/PACT.2015.31","DOI":"10.1109\/PACT.2015.31"},{"key":"1255_CR30","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1016\/j.future.2017.02.036","volume":"82","author":"Y Zhang","year":"2018","unstructured":"Zhang Y et al (2018) Locality based warp scheduling in GPGPUs. Future Gener. Comput. Syst. 82:520\u2013527. https:\/\/doi.org\/10.1016\/j.future.2017.02.036","journal-title":"Future Gener. Comput. Syst."},{"issue":"4","key":"1255_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.9708\/jksci.2019.24.04.001","volume":"24","author":"GB Kim","year":"2019","unstructured":"Kim GB, Kim JM, Kim CH (2019) Latency hiding based warp scheduling policy for high performance GPUs. J. Korea Soc. Comput. Inf. 24(4):1\u20139. https:\/\/doi.org\/10.9708\/jksci.2019.24.04.001","journal-title":"J. Korea Soc. Comput. Inf."},{"key":"1255_CR32","doi-asserted-by":"crossref","unstructured":"Kothapalli, K., et al.: A performance prediction model for the CUDA GPGPU platform. 
In: 2009 International Conference on High Performance Computing (HiPC), IEEE. (2009) https:\/\/doi.org\/10.1109\/HIPC.2009.5433179","DOI":"10.1109\/HIPC.2009.5433179"},{"key":"1255_CR33","unstructured":"Kirtzic, J. S., Daescu, O., Richardson, T. X.: A parallel algorithm development model for the GPU architecture. In: Proceedings of Int\u2019l Conference on Parallel and Distributed Processing Techniques and Applications (2012)"},{"key":"1255_CR34","unstructured":"Lai Junjie, A. S.: TEG: Gpu performance estimation using a timing model. Diss, INRIA, p 2011 (2011)"},{"key":"1255_CR35","doi-asserted-by":"crossref","unstructured":"Amaris, M., et al.: A simple BSP-based model to predict execution time in GPU applications. In: 2015 IEEE 22nd International Conference on High Performance Computing (HiPC), IEEE. (2015) https:\/\/doi.org\/10.1109\/HiPC.2015.34","DOI":"10.1109\/HiPC.2015.34"},{"key":"1255_CR36","unstructured":"Kim, H., et al.: Macsim: A CPU-GPU heterogeneous simulation framework user guide. Georgia Institute of Technology (2012)"},{"issue":"12","key":"1255_CR37","doi-asserted-by":"publisher","first-page":"2865","DOI":"10.1109\/TPDS.2020.3004623","volume":"31","author":"Q Wang","year":"2020","unstructured":"Wang Q, Chu X (2020) GPGPU performance estimation with core and memory frequency scaling. IEEE Trans. Parallel Distrib. Syst. 31(12):2865\u20132881. https:\/\/doi.org\/10.1109\/TPDS.2020.3004623","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"7","key":"1255_CR38","doi-asserted-by":"publisher","first-page":"1165","DOI":"10.1109\/TCAD.2015.2501303","volume":"35","author":"Y Liang","year":"2015","unstructured":"Liang Y et al (2015) An accurate GPU performance model for effective control flow divergence optimization. IEEE Trans. Comput.-Aided Des. Integr. Circuits Syst. 35(7):1165\u20131178. https:\/\/doi.org\/10.1109\/TCAD.2015.2501303","journal-title":"IEEE Trans. Comput.-Aided Des. Integr. 
Circuits Syst."},{"key":"1255_CR39","doi-asserted-by":"crossref","unstructured":"Cornelis, J. G., Lemeire, J.: The pipeline performance model: a generic executable performance model for GPUs. In: 2019 27th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), IEEE. (2019) https:\/\/doi.org\/10.1109\/EMPDP.2019.8671606","DOI":"10.1109\/EMPDP.2019.8671606"},{"key":"1255_CR40","doi-asserted-by":"crossref","unstructured":"O\u2019Neal, K., et al. HALWPE: Hardware-assisted light weight performance estimation for GPUs. In: 2017 54th ACM\/EDAC\/IEEE Design Automation Conference (DAC), IEEE. (2017) https:\/\/doi.org\/10.1145\/3061639.3062257","DOI":"10.1145\/3061639.3062257"},{"key":"1255_CR41","doi-asserted-by":"crossref","unstructured":"Wu, G., et al.: GPGPU performance and power estimation using machine learning. In: 2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA), IEEE. (2015) https:\/\/doi.org\/10.1109\/HPCA.2015.7056063","DOI":"10.1109\/HPCA.2015.7056063"},{"key":"1255_CR42","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-021-03428-8","author":"M Lattuada","year":"2022","unstructured":"Lattuada M et al (2022) Performance prediction of deep learning applications training in GPU as a service systems. Cluster Comput. https:\/\/doi.org\/10.1007\/s10586-021-03428-8","journal-title":"Cluster Comput."},{"issue":"2","key":"1255_CR43","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/MM.2008.31","volume":"28","author":"E Lindholm","year":"2008","unstructured":"Lindholm E et al (2008) NVIDIA Tesla: a unified graphics and computing architecture. IEEE micro 28(2):39\u201355. https:\/\/doi.org\/10.1109\/MM.2008.31","journal-title":"IEEE micro"},{"key":"1255_CR44","unstructured":"NVIDIA: Whitepaper NVIDIA Tesla P100. 
NVIDIA Corporation (2016)"},{"key":"1255_CR45","unstructured":"NVIDIA: Whitepaper NVIDIA TURING GPU ARCHITECTURE, NVIDIA Corp (2018)"},{"key":"1255_CR46","doi-asserted-by":"crossref","unstructured":"Wong, H., et al.: Demystifying GPU microarchitecture through microbenchmarking. In: 2010 IEEE International Symposium on Performance Analysis of Systems & Software (ISPASS), IEEE. (2010) https:\/\/doi.org\/10.1109\/ISPASS.2010.5452013","DOI":"10.1109\/ISPASS.2010.5452013"},{"issue":"4","key":"1255_CR47","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295690","volume":"15","author":"SK Shekofteh","year":"2019","unstructured":"Shekofteh SK et al (2019) Metric selection for GPU Kernel classification. Trans. Archit. Code Optim. (TACO) 15(4):1\u201327. https:\/\/doi.org\/10.1145\/3295690","journal-title":"Trans. Archit. Code Optim. (TACO)"},{"key":"1255_CR48","doi-asserted-by":"crossref","unstructured":"Arafa, Y., et al.: Low overhead instruction latency characterization for nvidia gpgpus. In: 2019 IEEE High Performance Extreme Computing Conference (HPEC), IEEE. (2019) https:\/\/doi.org\/10.1109\/HPEC.2019.8916466","DOI":"10.1109\/HPEC.2019.8916466"},{"key":"1255_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00607-019-00780-x","volume":"102","author":"A Riahi","year":"2020","unstructured":"Riahi A, Savadi A, Naghibzadeh M (2020) Comparison of analytical and ML-based models for predicting CPU-GPU data transfer time. Computing 102:1\u201318. 
https:\/\/doi.org\/10.1007\/s00607-019-00780-x","journal-title":"Computing"}],"container-title":["Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00607-023-01255-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00607-023-01255-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00607-023-01255-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T18:05:21Z","timestamp":1716228321000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00607-023-01255-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,26]]},"references-count":49,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,5]]}},"alternative-id":["1255"],"URL":"https:\/\/doi.org\/10.1007\/s00607-023-01255-w","relation":{},"ISSN":["0010-485X","1436-5057"],"issn-type":[{"value":"0010-485X","type":"print"},{"value":"1436-5057","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,26]]},"assertion":[{"value":"24 September 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 December 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 February 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of 
interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}