{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T22:43:19Z","timestamp":1774219399624,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,6,22]],"date-time":"2019-06-22T00:00:00Z","timestamp":1561161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["SRC JUMP"],"award-info":[{"award-number":["SRC JUMP"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1763681,1629915,1629129,1439021,1317560"],"award-info":[{"award-number":["1763681,1629915,1629129,1439021,1317560"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,6,22]]},"DOI":"10.1145\/3307650.3322212","type":"proceedings-article","created":{"date-parts":[[2019,6,14]],"date-time":"2019-06-14T12:42:33Z","timestamp":1560516153000},"page":"210-223","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":36,"title":["Opportunistic computing in GPU architectures"],"prefix":"10.1145","author":[{"given":"Ashutosh","family":"Pattnaik","sequence":"first","affiliation":[{"name":"The Pennsylvania State University"}]},{"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University"}]},{"given":"Onur","family":"Kayiran","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc."}]},{"given":"Adwait","family":"Jog","sequence":"additional","affiliation":[{"name":"College of William &amp; Mary"}]},{"given":"Asit","family":"Mishra","sequence":"additional","affiliation":[{"name":"NVIDIA Corp."}]},{"given":"Mahmut T.","family":"Kandemir","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University"}]},{"given":"Anand","family":"Sivasubramaniam","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University"}]},{"given":"Chita R.","family":"Das","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University"}]}],"member":"320","published-online":{"date-parts":[[2019,6,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Vignesh Adhinarayanan et al. 2016. Measuring and Modeling On-Chip Interconnect Power on Real Hardware. In IISWC.","DOI":"10.1109\/IISWC.2016.7581263"},{"key":"e_1_3_2_1_2_1","unstructured":"Junwhan Ahn et al. 2015. PIM-enabled Instructions: A Low-overhead Locality-aware Processing-in-memory Architecture. In ISCA."},{"key":"e_1_3_2_1_3_1","unstructured":"Ashwin Mandayam Aji et al. 2015. Automatic Command Queue Scheduling for Task-Parallel Workloads in OpenCL. In CLUSTER."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2016.05.006"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"Jayvant Anantpur and R. Govindarajan. 2017. Taming Warp Divergence. In CGO.","DOI":"10.1109\/CGO.2017.7863728"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Ali Bakhoda et al. 2009. Analyzing CUDA workloads using a detailed GPU simulator. In ISPASS.","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"Ali Bakhoda et al. 2010. Throughput-Effective On-Chip Networks for Manycore Accelerators. In MICRO. 10.1109\/MICRO.2010.50","DOI":"10.1109\/MICRO.2010.50"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Amirali Boroumand et al. 2018. Google Workloads for Consumer Devices: Mitigating Data Movement Bottlenecks. In ASPLOS.","DOI":"10.1145\/3173162.3173177"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","unstructured":"Steve Carr et al. 1994. Compiler Optimizations for Improving Data Locality. In ASPLOS. 10.1145\/195473.195557","DOI":"10.1145\/195473.195557"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","unstructured":"J. Carter et al. 1999. Impulse: building a smarter memory controller. In HPCA.","DOI":"10.5555\/520549.822749"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/378239.379048"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","unstructured":"Anthony Danalis et al. 2010. The Scalable Heterogeneous Computing (SHOC) Benchmark Suite. In GPGPU. 10.1145\/1735688.1735702","DOI":"10.1145\/1735688.1735702"},{"key":"e_1_3_2_1_14_1","unstructured":"Advanced Micro Devices. 2017. Radeon Vega Architecture."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"Jeff Draper et al. 2002. The Architecture of the DIVA Processing-in-memory Chip. In ICS. 10.1145\/514191.514197","DOI":"10.1145\/514191.514197"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"S. Dublish et al. 2016. Characterizing Memory Bottlenecks in GPGPU Workloads. In IISWC.","DOI":"10.1109\/IISWC.2016.7581287"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"S. Dublish et al. 2017. Evaluating and mitigating bandwidth bottlenecks across the memory hierarchy in GPUs. In ISPASS.","DOI":"10.1109\/ISPASS.2017.7975295"},{"key":"e_1_3_2_1_18_1","volume-title":"DRAMA: An Architecture for Accelerated Processing Near Memory","author":"Amin Farmahini-Farahani","year":"2015","unstructured":"Amin Farmahini-Farahani et al. 2015. DRAMA: An Architecture for Accelerated Processing Near Memory. IEEE CAL 14, 1 (2015)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.12"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/2014698.2014893"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","unstructured":"Maya Gokhale et al. 1995. Processing in Memory: the Terasys Massively Parallel PIM Array. Computer 28 4 (1995). 10.1109\/2.375174","DOI":"10.1109\/2.375174"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","unstructured":"A. Gottlieb et al. 1983. The NYU Ultracomputer Designing an MIMD Shared Memory Parallel Computer. IEEE Trans. Comput. (1983). 10.1109\/TC.1983.1676201","DOI":"10.1109\/TC.1983.1676201"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Scott Grauer-Gray et al. 2012. Auto-tuning a High-level Language Targeted to GPU Codes. In 2012 Innovative Parallel Computing (InPar).","DOI":"10.1109\/InPar.2012.6339595"},{"key":"e_1_3_2_1_24_1","volume-title":"Spock: Exploiting Serverless Functions for SLO and Cost aware Resource Procurement in Public Cloud. In CLOUD.","author":"Jashwant Gunasekaran","year":"2019","unstructured":"Jashwant Gunasekaran et al. 2019. Spock: Exploiting Serverless Functions for SLO and Cost aware Resource Procurement in Public Cloud. In CLOUD."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3155287"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"Milad Hashemi et al. 2016. Accelerating Dependent Cache Misses with an Enhanced Memory Controller. In ISCA. 10.1109\/ISCA.2016.46","DOI":"10.1109\/ISCA.2016.46"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Kevin Hsieh et al. 2016. Transparent Offloading and Mapping (TOM): Enabling Programmer-Transparent Near-Data Processing in GPU Systems. In ISCA.","DOI":"10.1109\/ISCA.2016.27"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","unstructured":"Hyunjun Jang et al. 2015. Bandwidth-efficient On-chip Interconnect Designs for GPGPUs. In DAC. 10.1145\/2744769.2744803","DOI":"10.1145\/2744769.2744803"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Adwait Jog et al. 2015. Anatomy of GPU Memory System for Multi-Application Execution. In MEMSYS.","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","unstructured":"Adwait Jog et al. 2016. Exploiting Core Criticality for Enhanced GPU Performance. In SIGMETRICS. 10.1145\/2896377.2901468","DOI":"10.1145\/2896377.2901468"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Onur Kayiran et al. 2014. Managing GPU Concurrency in Heterogeneous Architectures. In MICRO.","DOI":"10.1109\/MICRO.2014.62"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"Onur Kayiran et al. 2016. &mu;C-States: Fine-grained GPU Datapath Power Management. In PACT. 10.1145\/2967938.2967941","DOI":"10.1145\/2967938.2967941"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2011.89"},{"key":"e_1_3_2_1_34_1","unstructured":"Hyojong Kim et al. 2015. Understanding Energy Aspects of Processing-near-Memory for HPC Workloads. In MEMSYS."},{"key":"e_1_3_2_1_35_1","unstructured":"Kyung Hoon Kim et al. 2017. Packet Coalescing Exploiting Data Redundancy in GPGPU Architectures. In ICS."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Peter M. Kogge. 1994. EXECUBE-A New Architecture for Scaleable MPPs. In ICPP. 10.1109\/ICPP.1994.108","DOI":"10.1109\/ICPP.1994.108"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","unstructured":"Jingwen Leng et al. 2013. GPUWattch: Enabling Energy Optimizations in GPGPUs. In ISCA. 10.1145\/2485922.2485964","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","unstructured":"Gabriel H. Loh. 2008. 3D-Stacked Memory Architectures for Multi-core Processors. In ISCA. 10.1109\/ISCA.2008.15","DOI":"10.1109\/ISCA.2008.15"},{"key":"e_1_3_2_1_39_1","volume-title":"Loh et al","author":"Gabriel H.","year":"2013","unstructured":"Gabriel H. Loh et al. 2013. A Processing-in-Memory Taxonomy and a Case for Studying Fixed-function PIM. In WoNDP."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","unstructured":"Jiayuan Meng et al. 2010. Dynamic Warp Subdivision for Integrated Branch and Memory Divergence Tolerance. In ISCA. 10.1145\/1815961.1815992","DOI":"10.1145\/1815961.1815992"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155631"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000111"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Aaftab Munshi. 2009. The OpenCL Specification. (2009) 1--314.","DOI":"10.1061\/41031(341)300"},{"key":"e_1_3_2_1_44_1","unstructured":"Naveen Muralimanohar et al. 2009. CACTI 6.0: A tool to model large caches. HP Laboratories (2009) 22--31."},{"key":"e_1_3_2_1_45_1","unstructured":"Lifeng Nai et al. 2017. GraphPIM: Enabling Instruction-Level PIM Offloading in Graph Computing Frameworks. In HPCA."},{"key":"e_1_3_2_1_46_1","unstructured":"NVIDIA. 2008. Parallel Thread Execution (PTX). (2008)."},{"key":"e_1_3_2_1_47_1","unstructured":"NVIDIA. 2011. CUDA C\/C++ SDK Code Samples."},{"key":"e_1_3_2_1_48_1","unstructured":"NVIDIA. 2017. NVIDIA TESLA V100 GPU Architecture."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","unstructured":"David Patterson et al. 1997. A Case for Intelligent RAM. IEEE Micro. 10.1109\/40.592312","DOI":"10.1109\/40.592312"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Ashutosh Pattnaik et al. 2016. Scheduling Techniques for GPU Architectures with Processing-In-Memory Capabilities. In PACT.","DOI":"10.1145\/2967938.2967940"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","unstructured":"Mukund Ramakrishna et al. 2016. GCA:Global Congestion Awareness for Load Balance in Networks-on-Chip. IEEE TPDS (2016). 10.1109\/TPDS.2015.2477840","DOI":"10.1109\/TPDS.2015.2477840"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Rohit Sunkam Ramanujam and Bill Lin. 2010. Destination-based Adaptive Routing on 2D Mesh Networks. In ANCS.","DOI":"10.1145\/1872007.1872030"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Prasanna Venkatesh Rengasamy et al. 2017. Characterizing diverse handheld apps for customized hardware acceleration. In IISWC.","DOI":"10.1109\/IISWC.2017.8167776"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Prasanna Venkatesh Rengasamy et al. 2018. CritICs Critiquing Criticality in Mobile Apps. In MICRO.","DOI":"10.1109\/MICRO.2018.00075"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540718"},{"key":"e_1_3_2_1_56_1","volume-title":"Schulte et al","author":"Michael J.","year":"2015","unstructured":"Michael J. Schulte et al. 2015. Achieving Exascale Capabilities through Heterogeneous Computing. IEEE Micro 35, 4 (2015)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","unstructured":"Akbar Sharifi et al. 2012. Addressing End-to-End Memory Access Latency in NoC-Based Multicores. In MICRO. 10.1109\/MICRO.2012.35","DOI":"10.1109\/MICRO.2012.35"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1970.5008902"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","unstructured":"Chen Sun et al. 2012. DSENT-a tool connecting emerging photonics with electronics for opto-electronic networks-on-chip modeling. In NoCS. 10.1109\/NOCS.2012.31","DOI":"10.1109\/NOCS.2012.31"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Xulong Tang et al. 2017. Controlled Kernel Launch for Dynamic Parallelism in GPUs. In HPCA.","DOI":"10.1109\/HPCA.2017.14"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","unstructured":"Xulong Tang et al. 2017. Data Movement Aware Computation Partitioning. In MICRO. 10.1145\/3123939.3123954","DOI":"10.1145\/3123939.3123954"},{"key":"e_1_3_2_1_62_1","first-page":"3","article-title":"Quantifying Data Locality in Dynamic Parallelism in GPUs","volume":"2","author":"Xulong Tang","year":"2018","unstructured":"Xulong Tang et al. 2018. Quantifying Data Locality in Dynamic Parallelism in GPUs. Proc. ACM Meas. Anal. Comput. Syst. 2, 3 (Dec. 2018).","journal-title":"Proc. ACM Meas. Anal. Comput. Syst."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/35.568214"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Prashanth Thinakaran et al. 2017. Phoenix: a constraint-aware scheduler for heterogeneous datacenters. In ICDCS. IEEE.","DOI":"10.1109\/ICDCS.2017.262"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195672"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","unstructured":"Dongping Zhang et al. 2014. TOP-PIM: Throughput-oriented Programmable Processing in Memory. In HPDC. 10.1145\/2600212.2600213","DOI":"10.1145\/2600212.2600213"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/1950365.1950408"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123948"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","unstructured":"Shulin Zhao et al. 2019. Understanding Energy Efficiency in IoT App Executions. In ICDCS.","DOI":"10.1109\/ICDCS.2019.00079"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"crossref","unstructured":"Amir Kavyan Ziabari et al. 2015. Asymmetric NoC Architectures for GPU Systems. In NOCS.","DOI":"10.1145\/2786572.2786596"}],"event":{"name":"ISCA '19: The 46th Annual International Symposium on Computer Architecture","location":"Phoenix Arizona","acronym":"ISCA '19","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE-CS\\DATC IEEE Computer Society"]},"container-title":["Proceedings of the 46th International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3307650.3322212","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3307650.3322212","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3307650.3322212","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:37Z","timestamp":1750202017000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3307650.3322212"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,6,22]]},"references-count":70,"alternative-id":["10.1145\/3307650.3322212","10.1145\/3307650"],"URL":"https:\/\/doi.org\/10.1145\/3307650.3322212","relation":{},"subject":[],"published":{"date-parts":[[2019,6,22]]},"assertion":[{"value":"2019-06-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}