{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T02:37:34Z","timestamp":1774579054614,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,9,11]],"date-time":"2016-09-11T00:00:00Z","timestamp":1473552000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,9,11]]},"DOI":"10.1145\/2967938.2967952","type":"proceedings-article","created":{"date-parts":[[2016,8,31]],"date-time":"2016-08-31T12:32:08Z","timestamp":1472646728000},"page":"341-352","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Automatically Exploiting Implicit Pipeline Parallelism from Multiple Dependent Kernels for GPUs"],"prefix":"10.1145","author":[{"given":"Gwangsun","family":"Kim","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"Jiyun","family":"Jeong","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"John","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"Mark","family":"Stephenson","sequence":"additional","affiliation":[{"name":"NVIDIA, Austin, USA"}]}],"member":"320","published-online":{"date-parts":[[2016,9,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Intel Solid-State Drive DC P3608 Series Product Specification. Intel.  Intel Solid-State Drive DC P3608 Series Product Specification. Intel."},{"key":"e_1_3_2_1_2_1","unstructured":"NVIDIA's Next Generation CUDA Compute Architecture: Kepler GK110.  NVIDIA's Next Generation CUDA Compute Architecture: Kepler GK110."},{"key":"e_1_3_2_1_3_1","volume-title":"AMD White Paper","author":"APU","year":"2011","unstructured":"APU 101 : All about amd fusion accelerated processing units . AMD White Paper , 2011 . APU 101: All about amd fusion accelerated processing units. AMD White Paper, 2011."},{"key":"e_1_3_2_1_4_1","volume-title":"NVIDIA","author":"CUDA","year":"2011","unstructured":"CUDA C\/C++ SDK code samples . NVIDIA , 2011 . CUDA C\/C++ SDK code samples. NVIDIA, 2011."},{"key":"e_1_3_2_1_5_1","unstructured":"CUPTI User's Guide. NVIDIA 2015.  CUPTI User's Guide. NVIDIA 2015."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"e_1_3_2_1_7_1","first-page":"163","volume-title":"Proceedings of ISPASS'09","author":"Bakhoda A.","unstructured":"A. Bakhoda , G. Yuan , W. Fung , H. Wong , and T. Aamodt . Analyzing CUDA workloads using a detailed GPU simulator . In Proceedings of ISPASS'09 , pages 163 -- 174 . A. Bakhoda, G. Yuan, W. Fung, H. Wong, and T. Aamodt. Analyzing CUDA workloads using a detailed GPU simulator. In Proceedings of ISPASS'09, pages 163--174."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2608020.2608024"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751218"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063401"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/367487.367501"},{"key":"e_1_3_2_1_13_1","unstructured":"D. Foley. NVLink Pascal and Stacked Memory: Feeding the Appetite for Big Data http:\/\/devblogs.nvidia.com\/parallelforall\/nvlink-pascal-stacked-memory-feeding-appetite-big-data.  D. Foley. NVLink Pascal and Stacked Memory: Feeding the Appetite for Big Data http:\/\/devblogs.nvidia.com\/parallelforall\/nvlink-pascal-stacked-memory-feeding-appetite-big-data."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736059"},{"key":"e_1_3_2_1_15_1","first-page":"1","volume-title":"Proceedings of InPar'12","author":"Grauer-Gray S.","unstructured":"S. Grauer-Gray , L. Xu , R. Searles , S. Ayalasomayajula , and J. Cavazos . Auto-tuning a high-level language targeted to gpu codes . In Proceedings of InPar'12 , pages 1 -- 10 . S. Grauer-Gray, L. Xu, R. Searles, S. Ayalasomayajula, and J. Cavazos. Auto-tuning a high-level language targeted to gpu codes. In Proceedings of InPar'12, pages 1--10."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/2015039.2015535"},{"key":"e_1_3_2_1_17_1","volume-title":"SC'13","author":"Harris M.","unstructured":"M. Harris . Unified memory in cuda 6.0. NVIDIA GPU Technology Theater , SC'13 . M. Harris. Unified memory in cuda 6.0. NVIDIA GPU Technology Theater, SC'13."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2259016.2259038"},{"key":"e_1_3_2_1_19_1","unstructured":"A. Joshi. Accelerating various c++ applications using cuda. http:\/\/joshiscorner.com\/files\/src\/blog\/laplace-cuda-code.html.  A. Joshi. Accelerating various c++ applications using cuda. http:\/\/joshiscorner.com\/files\/src\/blog\/laplace-cuda-code.html."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1941553.1941591"},{"key":"e_1_3_2_1_21_1","first-page":"201","volume-title":"Proceedings of OSDI'14","author":"Kim S.","unstructured":"S. Kim , S. Huh , X. Zhang , Y. Hu , A. Wated , E. Witchel , and M. Silberstein . Gpunet: Networking abstractions for gpu programs . In Proceedings of OSDI'14 , pages 201 -- 216 . S. Kim, S. Huh, X. Zhang, Y. Hu, A. Wated, E. Witchel, and M. Silberstein. Gpunet: Networking abstractions for gpu programs. In Proceedings of OSDI'14, pages 201--216."},{"key":"e_1_3_2_1_22_1","first-page":"546","volume-title":"Proceedings of HPCA'14","author":"Kim Y.","unstructured":"Y. Kim , J. Lee , J.-E. Jo , and J. Kim . GPUdmm: A high-performance and memory-oblivious gpu architecture using dynamic memory management . In Proceedings of HPCA'14 , pages 546 -- 557 . Y. Kim, J. Lee, J.-E. Jo, and J. Kim. GPUdmm: A high-performance and memory-oblivious gpu architecture using dynamic memory management. In Proceedings of HPCA'14, pages 546--557."},{"key":"e_1_3_2_1_23_1","volume-title":"AMD","author":"Kyriazis G.","year":"2012","unstructured":"G. Kyriazis . Heterogeneous system architecture: A technical review . AMD , 2012 . G. Kyriazis. Heterogeneous system architecture: A technical review. AMD, 2012."},{"key":"e_1_3_2_1_24_1","first-page":"1","volume-title":"Proceedings of HPEC'14","author":"Landaverde R.","unstructured":"R. Landaverde , T. Zhang , A. K. Coskun , and M. Herbordt . An investigation of unified memory access performance in cuda . In Proceedings of HPEC'14 , pages 1 -- 6 . R. Landaverde, T. Zhang, A. K. Coskun, and M. Herbordt. An investigation of unified memory access performance in cuda. In Proceedings of HPEC'14, pages 1--6."},{"key":"e_1_3_2_1_25_1","first-page":"75","volume-title":"Proceedings of CGO'04","author":"Lattner C.","unstructured":"C. Lattner and V. Adve . Llvm: A compilation framework for lifelong program analysis & transformation . In Proceedings of CGO'04 , pages 75 --. C. Lattner and V. Adve. Llvm: A compilation framework for lifelong program analysis & transformation. In Proceedings of CGO'04, pages 75--."},{"key":"e_1_3_2_1_26_1","first-page":"260","volume-title":"Proceedings of HPCA'14","author":"Lee M.","unstructured":"M. Lee , S. Song , J. Moon , J. Kim , W. Seo , Y. Cho , and S. Ryu . Improving gpgpu resource utilization through alternative thread block scheduling . In Proceedings of HPCA'14 , pages 260 -- 271 . M. Lee, S. Song, J. Moon, J. Kim, W. Seo, Y. Cho, and S. Ryu. Improving gpgpu resource utilization through alternative thread block scheduling. In Proceedings of HPCA'14, pages 260--271."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522332"},{"key":"e_1_3_2_1_28_1","volume-title":"Streams and Concurrency. GPU Technology Conference","author":"Rennich S.","year":"2011","unstructured":"S. Rennich . CUDA C\/C++ Streams and Concurrency. GPU Technology Conference , 2011 . S. Rennich. CUDA C\/C++ Streams and Concurrency. GPU Technology Conference, 2011."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of ASBD'14","author":"Shihab M.","unstructured":"M. Shihab , K. Taht , and M. Jung . Gpudrive: Reconsidering storage accesses for gpu acceleration . In Proceedings of ASBD'14 . M. Shihab, K. Taht, and M. Jung. Gpudrive: Reconsidering storage accesses for gpu acceleration. In Proceedings of ASBD'14."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451169"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.30"},{"key":"e_1_3_2_1_32_1","volume-title":"Maximizing GPU Efficiency in Extreme Throughput Applications. GPU Technology Conference","author":"Stam J.","year":"2009","unstructured":"J. Stam . Maximizing GPU Efficiency in Extreme Throughput Applications. GPU Technology Conference , 2009 . J. Stam. Maximizing GPU Efficiency in Extreme Throughput Applications. GPU Technology Conference, 2009."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2661229.2661250"},{"key":"e_1_3_2_1_34_1","volume-title":"Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing","author":"Stratton J.","year":"2012","unstructured":"J. Stratton , C. Rodrigues , I. Sung , N. Obeid , L. Chang , N. Anssari , G. Liu , and W. Hwu . Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing . Center for Reliable and High-Performance Computing , 2012 . J. Stratton, C. Rodrigues, I. Sung, N. Obeid, L. Chang, N. Anssari, G. Liu, and W. Hwu. Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Center for Reliable and High-Performance Computing, 2012."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1477926.1477930"},{"key":"e_1_3_2_1_36_1","first-page":"345","volume-title":"Proceedings of HPCA'16","author":"Zheng T.","unstructured":"T. Zheng , D. Nellans , A. Zulfiqar , M. Stephenson , and S. W. Keckler . Towards High Performance Paged Memory for GPUs . In Proceedings of HPCA'16 , pages 345 -- 357 . T. Zheng, D. Nellans, A. Zulfiqar, M. Stephenson, and S. W. Keckler. Towards High Performance Paged Memory for GPUs. In Proceedings of HPCA'16, pages 345--357."}],"event":{"name":"PACT '16: International Conference on Parallel Architectures and Compilation","location":"Haifa Israel","acronym":"PACT '16","sponsor":["IFIP WG 10.3 IFIP WG 10.3","IEEE TCCA IEEE Computer Society Technical Committee on Computer Architecture","SIGARCH ACM Special Interest Group on Computer Architecture","IEEE CS TCPP IEEE Computer Society Technical Committee on Parallel Processing"]},"container-title":["Proceedings of the 2016 International Conference on Parallel Architectures and Compilation"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2967938.2967952","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2967938.2967952","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T03:49:59Z","timestamp":1750218599000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2967938.2967952"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,9,11]]},"references-count":36,"alternative-id":["10.1145\/2967938.2967952","10.1145\/2967938"],"URL":"https:\/\/doi.org\/10.1145\/2967938.2967952","relation":{},"subject":[],"published":{"date-parts":[[2016,9,11]]},"assertion":[{"value":"2016-09-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}