{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,7]],"date-time":"2026-02-07T08:47:56Z","timestamp":1770454076329,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,6,12]],"date-time":"2018-06-12T00:00:00Z","timestamp":1528761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 61572508 and 61672526"],"award-info":[{"award-number":["No. 61572508 and 61672526"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100010663","name":"European Research Council","doi-asserted-by":"publisher","award":["No. 741097"],"award-info":[{"award-number":["No. 741097"]}],"id":[{"id":"10.13039\/100010663","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003130","name":"Fonds Wetenschappelijk Onderzoek","doi-asserted-by":"publisher","award":["G.0434.16N and G.0144.17N"],"award-info":[{"award-number":["G.0434.16N and G.0144.17N"]}],"id":[{"id":"10.13039\/501100003130","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,6,12]]},"DOI":"10.1145\/3205289.3205311","type":"proceedings-article","created":{"date-parts":[[2018,9,13]],"date-time":"2018-09-13T12:54:52Z","timestamp":1536843292000},"page":"65-75","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":26,"title":["Classification-Driven Search for Effective SM Partitioning in Multitasking GPUs"],"prefix":"10.1145","author":[{"given":"Xia","family":"Zhao","sequence":"first","affiliation":[{"name":"Ghent University, Belgium"}]},{"given":"Zhiying","family":"Wang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}]},{"given":"Lieven","family":"Eeckhout","sequence":"additional","affiliation":[{"name":"Ghent University, Belgium"}]}],"member":"320","published-online":{"date-parts":[[2018,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872362.2872368"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1996130.1996160"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2854038.2854040"},{"key":"e_1_3_2_1_4_1","first-page":"109","author":"Suzuki Y.","year":"2014","unstructured":"Y. Suzuki , S. Kato , H. Yamada , and K. Kono , \"GPUvm: Why Not Virtualizing GPUs at the Hypervisor? ,\" in Proceedings of the USENIX Annual Technical Conference (ATC) , pp. 109 -- 120 , June 2014 . Y. Suzuki, S. Kato, H. Yamada, and K. Kono, \"GPUvm: Why Not Virtualizing GPUs at the Hypervisor?,\" in Proceedings of the USENIX Annual Technical Conference (ATC), pp. 109--120, June 2014.","journal-title":",\" in Proceedings of the USENIX Annual Technical Conference (ATC)"},{"key":"e_1_3_2_1_5_1","unstructured":"Amazon \"Amazon web services.\" https:\/\/aws.amazon.com\/cn\/ec2\/.  Amazon \"Amazon web services.\" https:\/\/aws.amazon.com\/cn\/ec2\/."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/2665671.2665702"},{"key":"e_1_3_2_1_8_1","first-page":"503","author":"Awatramani M.","year":"2013","unstructured":"M. Awatramani , J. Zambreno , and D. Rover , \"Increasing GPU Throughput using Kernel Interleaved Thread Block Scheduling,\" in Proceedings of the International Conference on Computer Design (ICCD) , pp. 503 -- 506 , October 2013 . M. Awatramani, J. Zambreno, and D. Rover, \"Increasing GPU Throughput using Kernel Interleaved Thread Block Scheduling,\" in Proceedings of the International Conference on Computer Design (ICCD), pp. 503--506, October 2013.","journal-title":"\"Increasing GPU Throughput using Kernel Interleaved Thread Block Scheduling,\" in Proceedings of the International Conference on Computer Design (ICCD)"},{"key":"e_1_3_2_1_9_1","first-page":"358","volume-title":"Simultaneous Multikernel GPU: Multitasking Throughput Processors via Fine-Grained Sharing,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Wang Z.","year":"2016","unstructured":"Z. Wang , J. Yang , R. Melhem , B. Childers , Y. Zhang , and M. Guo , \" Simultaneous Multikernel GPU: Multitasking Throughput Processors via Fine-Grained Sharing,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA) , pp. 358 -- 369 , March 2016 . Z. Wang, J. Yang, R. Melhem, B. Childers, Y. Zhang, and M. Guo, \"Simultaneous Multikernel GPU: Multitasking Throughput Processors via Fine-Grained Sharing,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 358--369, March 2016."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.29"},{"key":"e_1_3_2_1_11_1","unstructured":"\"NVIDIA Tesla V100 Volta Architecture.\"  \"NVIDIA Tesla V100 Volta Architecture.\""},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037707"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123975"},{"key":"e_1_3_2_1_15_1","volume-title":"March","author":"Wang H.","year":"2018","unstructured":"H. Wang , F. Luo , M. Ibrahim , O. Kayiran , and A. Jog , \" Efficient and Fair Multiprogramming in GPUs via Effective Bandwidth Management,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA) , March 2018 . H. Wang, F. Luo, M. Ibrahim, O. Kayiran, and A. Jog, \"Efficient and Fair Multiprogramming in GPUs via Effective Bandwidth Management,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), March 2018."},{"key":"e_1_3_2_1_16_1","first-page":"1","author":"Jadidi A.","year":"2017","unstructured":"A. Jadidi , M. Arjomand , M. T. Kandemir , and C. R. Das , \"Optimizing Energy Consumption in GPUS Through Feedback-driven CTA Scheduling,\" in Proceedings of the High Performance Computing Symposium (HPC) , pp. 12: 1 -- 12 :12, April 2017 . A. Jadidi, M. Arjomand, M. T. Kandemir, and C. R. Das, \"Optimizing Energy Consumption in GPUS Through Feedback-driven CTA Scheduling,\" in Proceedings of the High Performance Computing Symposium (HPC), pp. 12:1--12:12, April 2017.","journal-title":"\"Optimizing Energy Consumption in GPUS Through Feedback-driven CTA Scheduling,\" in Proceedings of the High Performance Computing Symposium (HPC)"},{"key":"e_1_3_2_1_17_1","volume-title":"December","author":"Jadidi A.","year":"2015","unstructured":"A. Jadidi , \" Kernel-Based Energy Optimization In GP Us ,\" Master's thesis , The Pennsylvania State University , December 2015 . A. Jadidi, \"Kernel-Based Energy Optimization In GPUs,\" Master's thesis, The Pennsylvania State University, December 2015."},{"key":"e_1_3_2_1_18_1","first-page":"1","author":"Vijaykumar N.","year":"2016","unstructured":"N. Vijaykumar , K. Hsieh , G. Pekhimenko , S. Khan , A. Shrestha , S. Ghose , A. Jog , P. B. Gibbons , and O. Mutlu , \"Zorua: A Holistic Approach to Resource Virtualization in GPUs,\" in Proceedings of the International Symposium on Microarchitecture (MICRO) , pp. 1 -- 14 , October 2016 . N. Vijaykumar, K. Hsieh, G. Pekhimenko, S. Khan, A. Shrestha, S. Ghose, A. Jog, P. B. Gibbons, and O. Mutlu, \"Zorua: A Holistic Approach to Resource Virtualization in GPUs,\" in Proceedings of the International Symposium on Microarchitecture (MICRO), pp. 1--14, October 2016.","journal-title":"\"Zorua: A Holistic Approach to Resource Virtualization in GPUs,\" in Proceedings of the International Symposium on Microarchitecture (MICRO)"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830784"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168947"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694346"},{"key":"e_1_3_2_1_22_1","unstructured":"Nvidia \"NVIDIA TESLA P100 GPU ACCELERATOR.\" https:\/\/images.nvidia.com\/content\/tesla\/pdf\/nvidia-tesla-p100-PCIe-datasheet.pdf 2016.  Nvidia \"NVIDIA TESLA P100 GPU ACCELERATOR.\" https:\/\/images.nvidia.com\/content\/tesla\/pdf\/nvidia-tesla-p100-PCIe-datasheet.pdf 2016."},{"key":"e_1_3_2_1_23_1","first-page":"163","volume-title":"Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in Proceeding of the International Symposium on Performance Analysis of Systems and Software (ISPASS)","author":"Bakhoda A.","year":"2009","unstructured":"A. Bakhoda , G. L. Yuan , W. W. L. Fung , H. Wong , and T. M. Aamodt , \" Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in Proceeding of the International Symposium on Performance Analysis of Systems and Software (ISPASS) , pp. 163 -- 174 , April 2009 . A. Bakhoda, G. L. Yuan, W. W. L. Fung, H. Wong, and T. M. Aamodt, \"Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in Proceeding of the International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 163--174, April 2009."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_2_1_25_1","unstructured":"\"NVIDIA CUDA SDK Code Samples.\" https:\/\/developer.nvidia.com\/cuda-downloads.  \"NVIDIA CUDA SDK Code Samples.\" https:\/\/developer.nvidia.com\/cuda-downloads."},{"key":"e_1_3_2_1_26_1","volume-title":"March","author":"Stratton J. A.","year":"2012","unstructured":"J. A. Stratton , C. Rodrigues , I.-J. Sung , N. Obeid , L.-W. Chang , N. Anssari , G. D. Liu , and W.-m. W. Hwu , \"Parboil : A Revised Benchmark Suite for Scientific and Commercial Throughput Computing,\" tech. rep ., March 2012 . J. A. Stratton, C. Rodrigues, I.-J. Sung, N. Obeid, L.-W. Chang, N. Anssari, G. D. Liu, and W.-m. W. Hwu, \"Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing,\" tech. rep., March 2012."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_28_1","first-page":"1","author":"Grauer-Gray S.","year":"2012","unstructured":"S. Grauer-Gray , L. Xu , R. Searles , S. Ayalasomayajula , and J. Cavazos , \"Autotuning a High-Level Language Targeted to GPU Codes,\" in Proceedings of Innovative Parallel Computing(InPar) , pp. 1 -- 10 , May 2012 . S. Grauer-Gray, L. Xu, R. Searles, S. Ayalasomayajula, and J. Cavazos, \"Autotuning a High-Level Language Targeted to GPU Codes,\" in Proceedings of Innovative Parallel Computing(InPar), pp. 1--10, May 2012.","journal-title":"\"Autotuning a High-Level Language Targeted to GPU Codes,\" in Proceedings of Innovative Parallel Computing(InPar)"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454152"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.44"},{"key":"e_1_3_2_1_31_1","first-page":"144","volume-title":"POSTER:Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls,\" in Proceedings of the International Conference on Parallel Architectures and Compilation (PACT)","author":"Dai H.","year":"2017","unstructured":"H. Dai , Z. Lin , C. Li , C. Zhao , F. Wang , N. Zheng , and H. Zhou , \" POSTER:Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls,\" in Proceedings of the International Conference on Parallel Architectures and Compilation (PACT) , pp. 144 -- 145 , September 2017 . H. Dai, Z. Lin, C. Li, C. Zhao, F. Wang, N. Zheng, and H. Zhou, \"POSTER:Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls,\" in Proceedings of the International Conference on Parallel Architectures and Compilation (PACT), pp. 144--145, September 2017."},{"key":"e_1_3_2_1_32_1","volume-title":"March","author":"Dai H.","year":"2018","unstructured":"H. Dai , Z. Lin , C. Li , C. Zhao , F. Wang , N. Zheng , and H. Zhou , \" Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA) , March 2018 . H. Dai, Z. Lin, C. Li, C. Zhao, F. Wang, N. Zheng, and H. Zhou, \"Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), March 2018."},{"key":"e_1_3_2_1_33_1","first-page":"440","volume-title":"Fair Share: Allocation of GPU Resources for Both Performance and Fairness,\" in Proceedings of the International Conference on Computer Design (ICCD)","author":"Aguilera P.","year":"2014","unstructured":"P. Aguilera , K. Morrow , and N. S. Kim , \" Fair Share: Allocation of GPU Resources for Both Performance and Fairness,\" in Proceedings of the International Conference on Computer Design (ICCD) , pp. 440 -- 447 , October 2014 . P. Aguilera, K. Morrow, and N. S. Kim, \"Fair Share: Allocation of GPU Resources for Both Performance and Fairness,\" in Proceedings of the International Conference on Computer Design (ICCD), pp. 440--447, October 2014."},{"key":"e_1_3_2_1_34_1","first-page":"1","volume-title":"Automation Test in Europe Conference Exhibition (DATE)","author":"Li X.","year":"2016","unstructured":"X. Li and Y. Liang , \" Efficient Kernel Management on GPUs,\" in Proceedings of the Design , Automation Test in Europe Conference Exhibition (DATE) , pp. 115: 1 -- 115 :24, March 2016 . X. Li and Y. Liang, \"Efficient Kernel Management on GPUs,\" in Proceedings of the Design, Automation Test in Europe Conference Exhibition (DATE), pp. 115:1--115:24, March 2016."}],"event":{"name":"ICS '18: 2018 International Conference on Supercomputing","location":"Beijing China","acronym":"ICS '18","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 2018 International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3205289.3205311","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3205289.3205311","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T01:08:55Z","timestamp":1750208935000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3205289.3205311"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6,12]]},"references-count":34,"alternative-id":["10.1145\/3205289.3205311","10.1145\/3205289"],"URL":"https:\/\/doi.org\/10.1145\/3205289.3205311","relation":{},"subject":[],"published":{"date-parts":[[2018,6,12]]},"assertion":[{"value":"2018-06-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}