{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T23:50:34Z","timestamp":1783036234113,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":114,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,9,11]],"date-time":"2017-09-11T00:00:00Z","timestamp":1505088000000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1205618"],"award-info":[{"award-number":["1205618"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,9,11]]},"DOI":"10.1145\/2967938.2967940","type":"proceedings-article","created":{"date-parts":[[2016,8,31]],"date-time":"2016-08-31T08:32:08Z","timestamp":1472632328000},"page":"31-44","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":135,"title":["Scheduling Techniques for GPU Architectures with Processing-In-Memory Capabilities"],"prefix":"10.1145","author":[{"given":"Ashutosh","family":"Pattnaik","sequence":"first","affiliation":[{"name":"The Pennsylvania State University, University Park, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University, University Park, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Adwait","family":"Jog","sequence":"additional","affiliation":[{"name":"The College of William and Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Onur","family":"Kayiran","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices Inc., Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Asit K.","family":"Mishra","sequence":"additional","affiliation":[{"name":"Intel Corporation, Hillsboro, OR, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mahmut T.","family":"Kandemir","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University, University Park, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Onur","family":"Mutlu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chita R.","family":"Das","sequence":"additional","affiliation":[{"name":"The Pennsylvania State University, University Park, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2016,9,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"\"The Green500 List - June 2015.\""},{"key":"e_1_3_2_1_2_1","unstructured":"\"Top500 Supercomputer Sites - June 2015.\""},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","unstructured":"J. Ahn phet al. \"A Scalable Processing-in-memory Accelerator for Parallel Graph Processing \" in ISCA 2015. 10.1145\/2749469.2750386","DOI":"10.1145\/2749469.2750386"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750385"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"A. M. Aji phet al. \"Automatic Command Queue Scheduling for Task-Parallel Workloads in OpenCL \" in CLUSTER 2015. 10.1109\/CLUSTER.2015.15","DOI":"10.1109\/CLUSTER.2015.15"},{"key":"e_1_3_2_1_6_1","unstructured":"AMD \"Graphics Cores Next (GCN) Architecture \" 2012."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"N. Ardalani phet al. \"Cross-Architecture Performance Prediction (XAPP) Using CPU Code to Predict GPU Performance \" in MICRO 2015. 10.1145\/2830772.2830780","DOI":"10.1145\/2830772.2830780"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1631"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/2337159.2337207"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","unstructured":"R. Ausavarungnirun phet al. \"Exploiting Inter-Warp Heterogeneity to Improve GPGPU Performance \" in PACT 2015. 10.1109\/PACT.2015.38","DOI":"10.1109\/PACT.2015.38"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"P. E. Bailey phet al. \"Adaptive Configuration Selection for Power-constrained Heterogeneous Systems \" in ICPP 2014.","DOI":"10.1109\/ICPP.2014.46"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"A. Bakhoda phet al. \"Analyzing CUDA workloads using a detailed GPU simulator \" in ISPASS 2009.","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_1_13_1","volume-title":"Insights from a MICRO-46 Workshop,\" in IEEE Micro","author":"R. Balasubramonian","year":"2014","unstructured":"R. Balasubramonian phet al., \"Near-Data Processing: Insights from a MICRO-46 Workshop,\" in IEEE Micro, 2014."},{"key":"e_1_3_2_1_14_1","volume-title":"An Efficient Cache Coherence Mechanism for Processing-in-Memory,\" IEEE CAL","author":"A. Boroumand","year":"2016","unstructured":"A. Boroumand phet al., \"LazyPIM: An Efficient Cache Coherence Mechanism for Processing-in-Memory,\" IEEE CAL, 2016."},{"key":"e_1_3_2_1_15_1","volume-title":"An Efficient Cache Coherence Mechanism for Processing-in-Memory,\" IEEE CAL","author":"A. Boroumand","year":"2014","unstructured":"A. Boroumand phet al., \"LazyPIM: An Efficient Cache Coherence Mechanism for Processing-in-Memory,\" IEEE CAL, 2014."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","unstructured":"A. Bulu\u00e7 phet al. \"Solving Path Problems on the GPU \" Parallel Computing 2010. 10.1016\/j.parco.2009.12.002","DOI":"10.1016\/j.parco.2009.12.002"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"M. Burtscher phet al. \"A Quantitative Study of Irregular Programs on GPUs \" in IISWC 2012. 10.1109\/IISWC.2012.6402918","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/520549.822749"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","unstructured":"R. Chandra phet al. \"Scheduling and Page Migration for Multiprocessor Compute Servers \" in ASPLOS 1994. 10.1145\/195473.195485","DOI":"10.1145\/195473.195485"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"K. Chang phet al. \"Improving DRAM Performance by Parallelizing Refreshes with Accesses \" in HPCA 2014.","DOI":"10.1109\/HPCA.2014.6835946"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"L. Chen phet al. \"Dynamic Load Balancing on Single-and Multi-GPU Systems \" in IPDPS 2010.","DOI":"10.1109\/IPDPS.2010.5470413"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","unstructured":"G. Cybenko \"Dynamic load balancing for distributed memory multiprocessors \" JPDC 1989. 10.1016\/0743-7315(89)90021-X","DOI":"10.1016\/0743-7315(89)90021-X"},{"key":"e_1_3_2_1_24_1","unstructured":"W. J. Dally \"Challenges for Future Computing Systems \" HiPEAC Keynote 2015."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"A. Danalis phet al. \"The Scalable Heterogeneous Computing (SHOC) benchmark suite \" in GPGPU 2010. 10.1145\/1735688.1735702","DOI":"10.1145\/1735688.1735702"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"R. Das phet al. \"Application-to-core mapping policies to reduce memory system interference in multi-core systems \" in HPCA 2013. 10.1109\/HPCA.2013.6522311","DOI":"10.1109\/HPCA.2013.6522311"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","unstructured":"J. Draper phet al. \"The Architecture of the DIVA Processing-in-memory Chip \" in ICS 2002. 10.1145\/514191.514197","DOI":"10.1145\/514191.514197"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","unstructured":"C. Dubach phet al. \"Fast Compiler Optimisation Evaluation Using Code-Feature Based Performance Prediction \" in CF 2007. 10.1145\/1242531.1242553","DOI":"10.1145\/1242531.1242553"},{"key":"e_1_3_2_1_29_1","unstructured":"Y. Eckert phet al. \"Thermal Feasibility of Die-Stacked Processing in Memory \" in WoNDP 2014."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/1390681.1442794"},{"key":"e_1_3_2_1_31_1","volume-title":"Farmahini-Farahani phet al., \"DRAMA: An Architecture for Accelerated Processing near Memory,\" IEEE CAL","author":"A.","year":"2014","unstructured":"A. Farmahini-Farahani phet al., \"DRAMA: An Architecture for Accelerated Processing near Memory,\" IEEE CAL, 2014."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"M. R. Garey phet al. \"Some Simplified NP-complete Problems \" in STOC 1974. 10.1145\/800119.803884","DOI":"10.1145\/800119.803884"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.375174"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2010.5649549"},{"key":"e_1_3_2_1_35_1","unstructured":"GPGPU-Sim v3.2.1. Address mapping."},{"key":"e_1_3_2_1_36_1","unstructured":"GPGPU-Sim v3.2.1. GTX 480 Configuration."},{"key":"e_1_3_2_1_37_1","volume-title":"Grauer-Gray phet al., \"Auto-tuning a High-level Language targeted to GPU Codes,\" in InPar","author":"S.","year":"2012","unstructured":"S. Grauer-Gray phet al., \"Auto-tuning a High-level Language targeted to GPU Codes,\" in InPar, 2012."},{"key":"e_1_3_2_1_38_1","unstructured":"C. Gregg phet al. \"Dynamic Heterogeneous Scheduling Decisions using Historical Runtime Data \" in A4MMC 2011."},{"key":"e_1_3_2_1_39_1","volume-title":"Accelerator and System Design,\" in WoNDP","author":"Q. Guo","year":"2014","unstructured":"Q. Guo phet al., \"3D-Stacked Memory-Side Acceleration: Accelerator and System Design,\" in WoNDP, 2014."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/331532.331589"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"M. Hashemi phet al. \"Accelerating Dependent Cache Misses with an Enhanced Memory Controller \" in ISCA 2016.","DOI":"10.1109\/ISCA.2016.46"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454152"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1002\/0471722146"},{"key":"e_1_3_2_1_44_1","volume-title":"Enabling Programmer-Transparent Near-Data Processing in GPU Systems,\" in ISCA","author":"Offloading K. Hsieh phet al., \"Transparent","year":"2016","unstructured":"K. Hsieh phet al., \"Transparent Offloading and Mapping (TOM): Enabling Programmer-Transparent Near-Data Processing in GPU Systems,\" in ISCA, 2016."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","unstructured":"E. Ipek phet al. \"An Approach to Performance Prediction for Parallel Applications \" in Euro-Par 2005. 10.1007\/11549468_24","DOI":"10.1007\/11549468_24"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","unstructured":"E. Ipek phet al. \"Efficiently exploring architectural design spaces via predictive modeling \" in ASPLOS 2006. 10.1145\/1168857.1168882","DOI":"10.1145\/1168857.1168882"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.21"},{"key":"e_1_3_2_1_48_1","volume-title":"Oct.","author":"JEDEC","year":"2013","unstructured":"JEDEC, JESD235 High Bandwidth Memory (HBM) DRAM, Oct. 2013."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.51"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","unstructured":"J. A. Joao phet al. \"Bottleneck Identification and Scheduling in Multithreaded Applications \" in ASPLOS 2012. 10.1145\/2150976.2151001","DOI":"10.1145\/2150976.2151001"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","unstructured":"J. A. Joao phet al. \"Utility-based Acceleration of Multithreaded Applications on Asymmetric CMPs \" in ISCA 2013. 10.1145\/2485922.2485936","DOI":"10.1145\/2485922.2485936"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","unstructured":"A. Jog phet al. \"Anatomy of GPU Memory System for Multi-Application Execution \" in MEMSYS 2015. 10.1145\/2818950.2818979","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","unstructured":"A. Jog phet al. \"Exploiting Core Criticality for Enhanced Performance in GPUs \" in SIGMETRICS 2016. 10.1145\/2896377.2901468","DOI":"10.1145\/2896377.2901468"},{"key":"e_1_3_2_1_55_1","volume-title":"Toward an Advanced Intelligent Memory System,\" in ICCD","author":"Y. Kang","year":"1999","unstructured":"Y. Kang phet al., \"FlexRAM: Toward an Advanced Intelligent Memory System,\" in ICCD, 1999."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","unstructured":"I. Karlin phet al. \"Exploring Traditional and Emerging Parallel Programming Models using a Proxy Application \" in IPDPS 2013. 10.1109\/IPDPS.2013.115","DOI":"10.1109\/IPDPS.2013.115"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.5555\/2523721.2523745"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","unstructured":"S. Keckler phet al. \"GPUs and the Future of Parallel Computing \" in IEEE Micro 2011. 10.1109\/MM.2011.89","DOI":"10.1109\/MM.2011.89"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"H. Kim phet al. \"Bounding Memory Interference Delay in COTS-based Multi-Core Systems \" in RTAS 2014.","DOI":"10.1109\/RTAS.2014.6925998"},{"key":"e_1_3_2_1_60_1","volume-title":"A Scalable and High-performance Scheduling Algorithm for Multiple Memory Controllers,\" in HPCA","author":"Y. Kim","year":"2010","unstructured":"Y. Kim phet al., \"ATLAS: A Scalable and High-performance Scheduling Algorithm for Multiple Memory Controllers,\" in HPCA, 2010."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.51"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","unstructured":"P. Kogge \"EXECUBE-A New Architecture for Scaleable MPPs \" in ICPP 1994. 10.1109\/ICPP.1994.108","DOI":"10.1109\/ICPP.1994.108"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","unstructured":"C. J. Lee phet al. \"Improving Memory Bank-level Parallelism in the Presence of Prefetching \" in MICRO 2009. 10.1145\/1669112.1669155","DOI":"10.1145\/1669112.1669155"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2832911"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/781027.781048"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.5555\/554878.825093"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485928"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.5555\/2337159.2337161"},{"key":"e_1_3_2_1_70_1","unstructured":"G. H. Loh phet al. \"A Processing-in-Memory Taxonomy and a Case for Studying Fixed-function PIM \" in WoNDP 2013."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","unstructured":"G. Loh \"3D-Stacked Memory Architectures for Multi-core Processors \" in ISCA 2008. 10.1109\/ISCA.2008.15","DOI":"10.1109\/ISCA.2008.15"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2012.31"},{"key":"e_1_3_2_1_73_1","volume-title":"Exploring the Design Space,\" in ARCS","author":"S. Marko","year":"2015","unstructured":"S. Marko phet al., \"Processing-in-Memory: Exploring the Design Space,\" in ARCS, 2015."},{"key":"e_1_3_2_1_74_1","volume-title":"Introduction to Linear Regression Analysis","author":"Montgomery D. C.","year":"1992","unstructured":"D. C. Montgomery and E. Peck, Introduction to Linear Regression Analysis, 1992."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.5555\/1362903.1362921"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","unstructured":"S. P. Muralidhara phet al. \"Reducing Memory Interference in Multicore Systems via Application-Aware Memory Channel Partitioning \" in MICRO 2011. 10.1145\/2155620.2155664","DOI":"10.1145\/2155620.2155664"},{"key":"e_1_3_2_1_77_1","volume-title":"A systems architecture perspective,\" in IMW","author":"Mutlu O.","year":"2013","unstructured":"O. Mutlu, \"Memory scaling: A systems architecture perspective,\" in IMW, 2013."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.40"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.7"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","unstructured":"H. Nagasaka phet al. \"Statistical Power Modeling of GPU Kernels using Performance Counters \" in IGCC 2010. 10.1109\/GREENCOMP.2010.5598315","DOI":"10.1109\/GREENCOMP.2010.5598315"},{"key":"e_1_3_2_1_81_1","unstructured":"NVIDIA \"CUDA C\/C+ SDK Code Samples \" 2011."},{"key":"e_1_3_2_1_82_1","volume-title":"NVIDIA's Next Generation CUDA Compute Architecture","author":"Fermi NVIDIA","year":"2011","unstructured":"NVIDIA, \"Fermi: NVIDIA's Next Generation CUDA Compute Architecture,\" 2011."},{"key":"e_1_3_2_1_83_1","volume-title":"Maxwell GM20x","author":"A's Next NVIDIA","year":"2015","unstructured":"NVIDIA, \"NVIDIA's Next Generation CUDA Compute Architecture: Maxwell GM20x,\" 2015."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","unstructured":"L. S. Panwar phet al. \"Online Performance Projection for Clusters with Heterogeneous GPUs \" in ICPADS 2013. 10.1109\/.47","DOI":"10.1109\/.47"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","unstructured":"D. Patterson phet al. \"A Case for Intelligent RAM \" in IEEE Micro 1997. 10.1109\/40.592312","DOI":"10.1109\/40.592312"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"J. T. Pawlowski \"Hybrid Memory Cube \" Hotchips 2011.","DOI":"10.1109\/HOTCHIPS.2011.7477494"},{"key":"e_1_3_2_1_87_1","volume-title":"Analyzing the Impact of 3D-Stacked Memory Logic Devices on MapReduce Workloads,\" in ISPASS","author":"S. Pugsley","year":"2014","unstructured":"S. Pugsley phet al., \"NDC: Analyzing the Impact of 3D-Stacked Memory Logic Devices on MapReduce Workloads,\" in ISPASS, 2014."},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2015.58"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","unstructured":"A. Rahimi phet al. \"Energy-Efficient GPGPU Architectures via Collaborative Compilation and Memristive Memory-Based Computing \" in DAC 2014. 10.1145\/2593069.2593132","DOI":"10.1145\/2593069.2593132"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","unstructured":"S. F. Reddaway \"DAP - a Distributed Array Processor \" in ISCA 1973. 10.1145\/800123.803971","DOI":"10.1145\/800123.803971"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","unstructured":"T. G. Rogers phet al. \"Cache-Conscious Wavefront Scheduling \" in MICRO 2012. 10.1109\/MICRO.2012.16","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","unstructured":"T. G. Rogers phet al. \"Divergence-Aware Warp Scheduling \" in MICRO 2013. 10.1145\/2540708.2540718","DOI":"10.1145\/2540708.2540718"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.5555\/648191.750559"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","unstructured":"V. Seshadri phet al. \"Fast Bulk Bitwise AND and OR in DRAM \" IEEE CAL 2015. 10.1109\/LCA.2015.2434872","DOI":"10.1109\/LCA.2015.2434872"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540725"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830820"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.5555\/583069"},{"key":"e_1_3_2_1_98_1","volume-title":"Bit-Serial SIMD on the CM-2 and the Cray 2,\" in SIAM PP","author":"Smitley D.","year":"1990","unstructured":"D. Smitley and K. Iobst, \"Bit-Serial SIMD on the CM-2 and the Cray 2,\" in SIAM PP, 1990."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","unstructured":"H. S. Stone \"A Logic-in-Memory Computer \" IEEE TC 1970. 10.1109\/TC.1970.5008902","DOI":"10.1109\/TC.1970.5008902"},{"key":"e_1_3_2_1_100_1","volume-title":"Achieving High Performance and Fairness at Low Cost,\" in ICCD","author":"L. Subramanian","year":"2014","unstructured":"L. Subramanian phet al., \"The Blacklisting Memory Scheduler: Achieving High Performance and Fairness at Low Cost,\" in ICCD, 2014."},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830803"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522356"},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","unstructured":"M. A. Suleman phet al. \"Data marshaling for multi-core architectures \" in phISCA 2010. 10.1145\/1815961.1816020","DOI":"10.1145\/1815961.1816020"},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","unstructured":"M. A. Suleman phet al. \"Accelerating Critical Section Execution with Asymmetric Multi-core Architectures \" in ASPLOS 2009. 10.1145\/1508244.1508274","DOI":"10.1145\/1508244.1508274"},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","unstructured":"H. Topcuoglu phet al. \"Performance-effective and Low-complexity Task Scheduling for Heterogeneous Computing \" IEEE TPDS 2002. 10.1109\/71.993206","DOI":"10.1109\/71.993206"},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"publisher","DOI":"10.1145\/2847255"},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750399"},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.1145\/2731186.2731202"},{"key":"e_1_3_2_1_109_1","volume-title":"High-Density TSV Bandwidth,\" in HPCA","author":"D. H.","year":"2010","unstructured":"D. H. Woo phet al., \"An Optimized 3D-Stacked Memory Architecture by Exploiting Excessive, High-Density TSV Bandwidth,\" in HPCA, 2010."},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"crossref","unstructured":"G. Wu phet al. \"GPGPU Performance and Power Estimation using Machine Learning \" in HPCA 2015.","DOI":"10.1109\/HPCA.2015.7056063"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.5555\/548748"},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600212.2600213"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"publisher","unstructured":"L. Zhang phet al. \"Accurate Online Power Estimation and Automatic Battery Behavior based Power Model Generation for Smartphones \" in CODES ISSS 2010. 10.1145\/1878961.1878982","DOI":"10.1145\/1878961.1878982"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1145\/2429384.2429400"}],"event":{"name":"PACT '16: International Conference on Parallel Architectures and Compilation","location":"Haifa Israel","acronym":"PACT '16","sponsor":["IFIP WG 10.3 IFIP WG 10.3","IEEE TCCA IEEE Computer Society Technical Committee on Computer Architecture","SIGARCH ACM Special Interest Group on Computer Architecture","IEEE CS TCPP IEEE Computer Society Technical Committee on Parallel Processing"]},"container-title":["Proceedings of the 2016 International Conference on Parallel Architectures and Compilation"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2967938.2967940","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2967938.2967940","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2967938.2967940","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:27:45Z","timestamp":1763458065000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2967938.2967940"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,9,11]]},"references-count":114,"alternative-id":["10.1145\/2967938.2967940","10.1145\/2967938"],"URL":"https:\/\/doi.org\/10.1145\/2967938.2967940","relation":{},"subject":[],"published":{"date-parts":[[2016,9,11]]},"assertion":[{"value":"2016-09-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}