{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T08:24:50Z","timestamp":1759134290344,"version":"3.37.3"},"reference-count":31,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2019,1,17]],"date-time":"2019-01-17T00:00:00Z","timestamp":1547683200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100003593","name":"Conselho Nacional de Desenvolvimento Cient\u00edfico e Tecnol\u00f3gico","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003593","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1007\/s10586-018-02901-1","type":"journal-article","created":{"date-parts":[[2019,1,17]],"date-time":"2019-01-17T06:04:15Z","timestamp":1547705055000},"page":"177-188","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Kernel concurrency opportunities based on GPU benchmarks characterization"],"prefix":"10.1007","volume":"23","author":[{"given":"Pablo","family":"Carvalho","sequence":"first","affiliation":[]},{"given":"Rommel","family":"Cruz","sequence":"additional","affiliation":[]},{"given":"Lucia M. A.","family":"Drummond","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9092-6007","authenticated-orcid":false,"given":"Cristiana","family":"Bentes","sequence":"additional","affiliation":[]},{"given":"Esteban","family":"Clua","sequence":"additional","affiliation":[]},{"given":"Edson","family":"Cataldo","sequence":"additional","affiliation":[]},{"given":"Leandro A. J.","family":"Marzulo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,1,17]]},"reference":[{"key":"2901_CR1","doi-asserted-by":"crossref","unstructured":"Adriaens, J.T., Compton, K., Kim, N.S., Schulte, M.J.: The case for GPGPU spatial multitasking. In: IEEE 18th International Symposium on High Performance Computer Architecture (HPCA), pp. 1\u201312. IEEE (2012)","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"2901_CR2","unstructured":"Asanovic, K.: The landscape of parallel computing research: a view from berkeley. Tech. Rep. UCB\/EECS-2006-183, EECS Department, University of California, Berkley, CA, USA (2006)"},{"key":"2901_CR3","unstructured":"Bakhoda, A., Yuan, G.L., Fung, W.W., Wong, H., Aamodt, T.M.: Analyzing CUDA workloads using a detailed GPU simulator. In: IEEE International Symposium on Performance Analysis of Systems and Software, 2009. ISPASS 2009, pp. 163\u2013174. IEEE (2009)"},{"key":"2901_CR4","volume-title":"Benchmarking Modern Multiprocessors","author":"C Bienia","year":"2011","unstructured":"Bienia, C.: Benchmarking Modern Multiprocessors. Princeton University, Princeton (2011)"},{"key":"2901_CR5","unstructured":"Bienia, C.: Benchmarking modern multiprocessors. Ph.D. thesis, Princeton University (2011)"},{"key":"2901_CR6","doi-asserted-by":"crossref","unstructured":"Breder, B., Charles, E., Cruz, R., Clua, E., Bentes, C., Drummond, L.: Maximizando o uso dos recursos de GPU atrav\u00e9s da reordena\u00e7\u00e3o da submiss\u00e3o de kernels concorrentes. In: Anais do WSCAD 2016 Simp\u00f3sio de Sistemas Computacionais de Alto Desempenho, pp. 98\u2013109. Editora da Sociedade Brasileira de Computa\u00e7\u00e3o (SBC) (2016)","DOI":"10.5753\/wscad.2016.14264"},{"key":"2901_CR7","doi-asserted-by":"crossref","unstructured":"Burtscher, M., Nasre, R., Pingali, K.: A quantitative study of irregular programs on GPUs. In: 2012 IEEE International Symposium on Workload Characterization (IISWC), pp. 141\u2013151. IEEE (2012)","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"2901_CR8","first-page":"71","volume-title":"Communications in Computer and Information Science","author":"Pablo Carvalho","year":"2017","unstructured":"Carvalho, P., Drummond, L., Bentes, C., Clua, E., Cataldo, E., Marzulo, L.: Analysis and characterization of gpu benchmarks for kernel concurrency efficiency. In: Mocskos E., Nesmachnow S. (eds.) High Performance Computing. CARLA 2017. Communications in Computer and Information Science, vol. 796 (2017)"},{"key":"2901_CR9","doi-asserted-by":"crossref","unstructured":"Che, S., Boyer, M., Meng, J., Tarjan, D., Sheaffer, J.W., Lee, S.H., Skadron, K.: Rodinia: a benchmark suite for heterogeneous computing. In: Proceedings of the IEEE International Symposium on Workload Characterization (IISWC), pp. 44\u201354 (2009)","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"2901_CR10","doi-asserted-by":"crossref","unstructured":"Che, S., Sheaffer, J.W., Boyer, M., Szafaryn, L.G., Wang, L., Skadron, K.: A characterization of the rodinia benchmark suite with comparison to contemporary CMP workloads. In: Proceedings of the IEEE International Symposium on Workload Characterization (2010)","DOI":"10.1109\/IISWC.2010.5650274"},{"issue":"2","key":"2901_CR11","doi-asserted-by":"publisher","first-page":"238","DOI":"10.1177\/1094342013507960","volume":"28","author":"S Che","year":"2014","unstructured":"Che, S., Skadron, K.: Benchfriend: correlating the performance of GPU benchmarks. Int. J. High Perform. Comput. Appl. 28(2), 238\u2013250 (2014)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"2901_CR12","doi-asserted-by":"crossref","unstructured":"Cruz, R., Drummond, L., Clua, E., Bentes, C.: Analyzing and estimating the performance of concurrent kernels execution on GPUs. In: Proceedings of the XVIII Simp\u00f3sio em Sistemas Computacionais de Alto Desempenho-WSCAD (2017)","DOI":"10.5753\/wscad.2017.245"},{"key":"2901_CR13","unstructured":"Cruz, R.A., Bentes, C., Breder, B., Vasconcellos, E., Clua, E., de\u00a0Carvalho, P., Drummond, L.: Maximizing the GPU resource usage by reordering concurrent kernels submission. Concurr. Comput."},{"key":"2901_CR14","doi-asserted-by":"crossref","unstructured":"Danalis, A., Marin, G., McCurdy, C., Meredith, J.S., Roth, P.C., Spafford, K., Tipparaju, V., Vetter, J.S.: The scalable heterogeneous computing (SHOC) benchmark suite. In: Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units, pp. 63\u201374 (2010)","DOI":"10.1145\/1735688.1735702"},{"key":"2901_CR15","doi-asserted-by":"crossref","unstructured":"Goswami, N., Shankar, R., Joshi, M., Li, T.: Exploring GPGPU workloads: Characterization methodology, analysis and microarchitecture evaluation implications. In: 2010 IEEE International Symposium on Workload Characterization (IISWC), pp. 1\u201310. IEEE (2010)","DOI":"10.1109\/IISWC.2010.5649549"},{"key":"2901_CR16","doi-asserted-by":"crossref","unstructured":"Hu, Q., Shu, J., Fan, J., Lu, Y.: Run-time performance estimation and fairness-oriented scheduling policy for concurrent GPGPU applications. In: 2016 45th International Conference on Parallel Processing (ICPP), pp. 57\u201366. IEEE (2016)","DOI":"10.1109\/ICPP.2016.14"},{"key":"2901_CR17","doi-asserted-by":"crossref","unstructured":"Jog, A., Kayiran, O., Kesten, T., Pattnaik, A., Bolotin, E., Chatterjee, N., Keckler, S.W., Kandemir, M.T., Das, C.R.: Anatomy of GPU memory system for multi-application execution. In: Proceedings of the 2015 International Symposium on Memory Systems, pp. 223\u2013234. ACM (2015)","DOI":"10.1145\/2818950.2818979"},{"issue":"6","key":"2901_CR18","doi-asserted-by":"publisher","first-page":"769","DOI":"10.1109\/TC.2006.85","volume":"55","author":"A Joshi","year":"2006","unstructured":"Joshi, A., Phansalkar, A., Eeckhout, L., John, L.K.: Measuring benchmark similarity using inherent program characteristics. IEEE Trans. Comput. 55(6), 769\u2013782 (2006)","journal-title":"IEEE Trans. Comput."},{"key":"2901_CR19","unstructured":"Kerr, A., Diamos, G., Yalamanchili, S.: A characterization and analysis of PTX kernels. In: IEEE International Symposium on Workload Characterization, 2009. IISWC 2009, pp. 3\u201312. IEEE (2009)"},{"key":"2901_CR20","unstructured":"Li, T., Narayana, V.K., El-Ghazawi, T.: A power-aware symbiotic scheduling algorithm for concurrent GPU kernels. In: IEEE 21st International Conference on Parallel and Distributed Systems (ICPADS), 2015, pp. 562\u2013569 (2015)"},{"key":"2901_CR21","unstructured":"NVIDIA: Cuda multi process service overview (2017). https:\/\/docs.nvidia.com\/pdf\/CUDA_Multi_Process_Service_Overview.pdf"},{"key":"2901_CR22","unstructured":"NVIDIA Corp: Profiler user\u2019s guide. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html#nvprof-overview (2017). An optional note"},{"key":"2901_CR23","doi-asserted-by":"crossref","unstructured":"O\u2019Neil, M.A., Burtscher, M.: Microarchitectural performance characterization of irregular GPU kernels. In: 2014 IEEE International Symposium on Workload Characterization (IISWC), pp. 130\u2013139. IEEE (2014)","DOI":"10.1109\/IISWC.2014.6983052"},{"key":"2901_CR24","doi-asserted-by":"crossref","unstructured":"Pai, S., Thazhuthaveetil, M.J., Govindarajan, R.: Improving GPGPU concurrency with elastic kernels. In: ACM SIGPLAN Notices, vol. 48, pp. 407\u2013418. ACM (2013)","DOI":"10.1145\/2499368.2451160"},{"key":"2901_CR25","doi-asserted-by":"crossref","unstructured":"Ravi, V.T., Becchi, M., Agrawal, G., Chakradhar, S.: Supporting GPU sharing in cloud environments with a transparent runtime consolidation framework. In: Proceedings of the 20th International Symposium on High Performance Distributed Computing, pp. 217\u2013228. ACM (2011)","DOI":"10.1145\/1996130.1996160"},{"key":"2901_CR26","unstructured":"SHOC: (2012). https:\/\/github.com\/vetter\/shoc\/wiki"},{"key":"2901_CR27","first-page":"122","volume-title":"Lecture Notes in Computer Science","author":"Kyle Spafford","year":"2010","unstructured":"Spafford, K., Meredith, J.S., Vetter, J.S., Chen, J., Grout, R.W., Sankaran, R.: Accelerating S3D: a GPGPU case study. In: Euro-Par Workshops, pp. 122\u2013131. Springer, New York (2009)"},{"key":"2901_CR28","unstructured":"Stratton, J.A., Rodrigues, C., Sung, I.J., Obeid, N., Chang, L.W., Anssari, N., Liu, G.D., mei W.\u00a0Hwu, W.: Parboil: a revised benchmark suite for scientific and commercial throughput computing (2012)"},{"key":"2901_CR29","doi-asserted-by":"crossref","unstructured":"Wende, F., Cordes, F., Steinke, T.: On improving the performance of multi-threaded CUDA applications with concurrent kernel execution by kernel reordering. In: Symposium on Application Accelerators in High Performance Computing (SAAHPC), pp. 74\u201383 (2012)","DOI":"10.1109\/SAAHPC.2012.12"},{"key":"2901_CR30","doi-asserted-by":"crossref","unstructured":"Xu, Q., Jeon, H., Kim, K., Ro, W.W., Annavaram, M.: Warped-slicer: efficient intra-SM slicing through dynamic resource partitioning for GPU multiprogramming. In: Proceedings of the 43rd International Symposium on Computer Architecture, pp. 230\u2013242. IEEE Press (2016)","DOI":"10.1145\/3007787.3001161"},{"issue":"6","key":"2901_CR31","doi-asserted-by":"publisher","first-page":"1522","DOI":"10.1109\/TPDS.2013.257","volume":"25","author":"J Zhong","year":"2014","unstructured":"Zhong, J., He, B.: Kernelet: high-throughput GPU kernel executions with dynamic slicing and scheduling. IEEE Trans. Parallel Distrib. Syst. 25(6), 1522\u20131532 (2014)","journal-title":"IEEE Trans. Parallel Distrib. Syst."}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-018-02901-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10586-018-02901-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-018-02901-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,10]],"date-time":"2022-09-10T12:30:11Z","timestamp":1662813011000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10586-018-02901-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,1,17]]},"references-count":31,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2020,3]]}},"alternative-id":["2901"],"URL":"https:\/\/doi.org\/10.1007\/s10586-018-02901-1","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"type":"print","value":"1386-7857"},{"type":"electronic","value":"1573-7543"}],"subject":[],"published":{"date-parts":[[2019,1,17]]},"assertion":[{"value":"29 January 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2018","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2018","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}