{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T04:14:53Z","timestamp":1750306493090,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2015,12,5]],"date-time":"2015-12-05T00:00:00Z","timestamp":1449273600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2015,12,5]]},"DOI":"10.1145\/2830772.2830788","type":"proceedings-article","created":{"date-parts":[[2016,1,11]],"date-time":"2016-01-11T13:38:13Z","timestamp":1452519493000},"page":"750-761","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Enabling portable energy efficiency with memory accelerated library"],"prefix":"10.1145","author":[{"given":"Qi","family":"Guo","sequence":"first","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tze-Meng","family":"Low","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nikolaos","family":"Alachiotis","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Berkin","family":"Akin","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Larry","family":"Pileggi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"James C.","family":"Hoe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Franz","family":"Franchetti","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2015,12,5]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736044"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155640"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2012.132"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2159542.2159547"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485944"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541961"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694358"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485925"},{"key":"e_1_3_2_1_9_1","volume-title":"Active memory cube: A processing-in-memory architecture for exascale systems,\" IBM Journal of Research and Development","author":"Nair R.","year":"2015","unstructured":"R. Nair , \" Active memory cube: A processing-in-memory architecture for exascale systems,\" IBM Journal of Research and Development , 2015 . R. Nair et al., \"Active memory cube: A processing-in-memory architecture for exascale systems,\" IBM Journal of Research and Development, 2015."},{"key":"e_1_3_2_1_10_1","unstructured":"\"BLAS (basic linear algebra subprograms).\" http:\/\/www.netlib.org\/blas\/.  \"BLAS (basic linear algebra subprograms).\" http:\/\/www.netlib.org\/blas\/."},{"key":"e_1_3_2_1_11_1","volume-title":"The design and implementation of FFTW3,\" Proceedings of the IEEE","author":"Frigo M.","year":"2005","unstructured":"M. Frigo and S. Johnson , \" The design and implementation of FFTW3,\" Proceedings of the IEEE , 2005 . M. Frigo and S. Johnson, \"The design and implementation of FFTW3,\" Proceedings of the IEEE, 2005."},{"key":"e_1_3_2_1_12_1","unstructured":"\"The R project for statistical computing.\" http:\/\/www.r-project.org\/.  \"The R project for statistical computing.\" http:\/\/www.r-project.org\/."},{"key":"e_1_3_2_1_13_1","volume-title":"PERFECT (Power Efficiency Revolution For Embedded Computing Technologies) Benchmark Suite Manual","author":"Barker K.","year":"2013","unstructured":"K. Barker , PERFECT (Power Efficiency Revolution For Embedded Computing Technologies) Benchmark Suite Manual . Pacific Northwest National Laboratory and Georgia Tech Research Institute , 2013 . K. Barker et al., PERFECT (Power Efficiency Revolution For Embedded Computing Technologies) Benchmark Suite Manual. Pacific Northwest National Laboratory and Georgia Tech Research Institute, 2013."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454128"},{"key":"e_1_3_2_1_15_1","unstructured":"\"Intel math kernel library (MKL).\" http:\/\/software.intel.com\/en-us\/articles\/intel-mkl\/.  \"Intel math kernel library (MKL).\" http:\/\/software.intel.com\/en-us\/articles\/intel-mkl\/."},{"key":"e_1_3_2_1_16_1","volume-title":"Optimized hardware for suboptimal software: The case for SIMD-aware benchmarks,\" in ISPASS","author":"Cebrian J.","year":"2014","unstructured":"J. Cebrian , \" Optimized hardware for suboptimal software: The case for SIMD-aware benchmarks,\" in ISPASS , 2014 . J. Cebrian et al., \"Optimized hardware for suboptimal software: The case for SIMD-aware benchmarks,\" in ISPASS, 2014."},{"key":"e_1_3_2_1_17_1","volume-title":"Hybrid memory cube new dram architecture increases density and performance,\" in VLSIT","author":"Jeddeloh J.","year":"2012","unstructured":"J. Jeddeloh and B. Keeth , \" Hybrid memory cube new dram architecture increases density and performance,\" in VLSIT , 2012 . J. Jeddeloh and B. Keeth, \"Hybrid memory cube new dram architecture increases density and performance,\" in VLSIT, 2012."},{"key":"e_1_3_2_1_18_1","unstructured":"P. Colella \"Defining software requirements for scientific computing \" 2004.  P. Colella \"Defining software requirements for scientific computing \" 2004."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.15"},{"key":"e_1_3_2_1_20_1","volume-title":"An optimized 3D-stacked memory architecture by exploiting excessive, high-density tsv bandwidth,\" in HPCA","author":"Woo D. H.","year":"2010","unstructured":"D. H. Woo , \" An optimized 3D-stacked memory architecture by exploiting excessive, high-density tsv bandwidth,\" in HPCA , 2010 . D. H. Woo et al., \"An optimized 3D-stacked memory architecture by exploiting excessive, high-density tsv bandwidth,\" in HPCA, 2010."},{"key":"e_1_3_2_1_21_1","volume-title":"3D-maps: 3d massively parallel processor with stacked memory,\" in ISSCC","author":"Kim D. H.","year":"2012","unstructured":"D. H. Kim , \" 3D-maps: 3d massively parallel processor with stacked memory,\" in ISSCC , 2012 . D. H. Kim et al., \"3D-maps: 3d massively parallel processor with stacked memory,\" in ISSCC, 2012."},{"key":"e_1_3_2_1_22_1","unstructured":"Samsung \"Samsung to release 3D memory modules with 50% greater density \" 2010.  Samsung \"Samsung to release 3D memory modules with 50% greater density \" 2010."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750397"},{"key":"e_1_3_2_1_24_1","volume-title":"Understanding the design space of dram optimized hardware FFT accelerators,\" in ASAP","author":"Akin B.","year":"2014","unstructured":"B. Akin , \" Understanding the design space of dram optimized hardware FFT accelerators,\" in ASAP , 2014 . B. Akin et al., \"Understanding the design space of dram optimized hardware FFT accelerators,\" in ASAP, 2014."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736045"},{"key":"e_1_3_2_1_27_1","volume-title":"Algorithm\/hardware co-optimized sar image reconstruction with 3d-stacked logic in memory,\" in HPEC","author":"Sadi F.","year":"2014","unstructured":"F. Sadi , \" Algorithm\/hardware co-optimized sar image reconstruction with 3d-stacked logic in memory,\" in HPEC , 2014 . F. Sadi et al., \"Algorithm\/hardware co-optimized sar image reconstruction with 3d-stacked logic in memory,\" in HPEC, 2014."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.503777"},{"key":"e_1_3_2_1_29_1","volume-title":"Farmahini-Farahani et al., \"Nda: Near-dram acceleration architecture leveraging commodity dram devices and standard memory modules,\" in HPCA","author":"A.","year":"2015","unstructured":"A. Farmahini-Farahani et al., \"Nda: Near-dram acceleration architecture leveraging commodity dram devices and standard memory modules,\" in HPCA , 2015 . A. Farmahini-Farahani et al., \"Nda: Near-dram acceleration architecture leveraging commodity dram devices and standard memory modules,\" in HPCA, 2015."},{"key":"e_1_3_2_1_30_1","unstructured":"\"More knights landing xeon phi secrets unveiled.\" http:\/\/www.theplatform.net\/2015\/03\/25\/more-knights-landing-xeon-phi-secrets-unveiled\/.  \"More knights landing xeon phi secrets unveiled.\" http:\/\/www.theplatform.net\/2015\/03\/25\/more-knights-landing-xeon-phi-secrets-unveiled\/."},{"key":"e_1_3_2_1_31_1","series-title":"2","volume-title":"desktop intel pentium processor family, and desktop intel celeron processor family datasheet -","year":"2014","unstructured":"\"Desktop 4th generation intel core processor family , desktop intel pentium processor family, and desktop intel celeron processor family datasheet - volume 1 of 2 ,\" 2014 . \"Desktop 4th generation intel core processor family, desktop intel pentium processor family, and desktop intel celeron processor family datasheet - volume 1 of 2,\" 2014."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1177\/109434200001400303"},{"key":"e_1_3_2_1_33_1","unstructured":"\"Intel 64 and IA-32 architectures software developer\u015b.\" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/manuals\/64-ia-32-architectures-software developer-vol-3b-part-2-manual.pdf.  \"Intel 64 and IA-32 architectures software developer\u015b.\" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/manuals\/64-ia-32-architectures-software developer-vol-3b-part-2-manual.pdf."},{"key":"e_1_3_2_1_34_1","volume-title":"CACTI-3DD: Architecture-level modeling for 3d die-stacked dram main memory,\" in DATE","author":"Chen K.","year":"2012","unstructured":"K. Chen , \" CACTI-3DD: Architecture-level modeling for 3d die-stacked dram main memory,\" in DATE , 2012 . K. Chen et al., \"CACTI-3DD: Architecture-level modeling for 3d die-stacked dram main memory,\" in DATE, 2012."},{"key":"e_1_3_2_1_35_1","volume-title":"Accelerating sparse matrix-matrix multiplication with 3D-stacked logic-in-memory hardware,\" in HPEC","author":"Zhu Q.","year":"2013","unstructured":"Q. Zhu , \" Accelerating sparse matrix-matrix multiplication with 3D-stacked logic-in-memory hardware,\" in HPEC , 2013 . Q. Zhu et al., \"Accelerating sparse matrix-matrix multiplication with 3D-stacked logic-in-memory hardware,\" in HPEC, 2013."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_2_1_37_1","author":"Gonzalez R.","year":"1996","unstructured":"R. Gonzalez and M. Horowitz , \"Energy dissipation in general purpose microprocessors,\" IEEE Journal of Solid-State Circuits , 1996 . R. Gonzalez and M. Horowitz, \"Energy dissipation in general purpose microprocessors,\" IEEE Journal of Solid-State Circuits, 1996.","journal-title":"\"Energy dissipation in general purpose microprocessors,\" IEEE Journal of Solid-State Circuits"},{"key":"e_1_3_2_1_38_1","volume-title":"NDC: analyzing the impact of 3d-stacked memory+logic devices on mapreduce workloads,\" in ISPASS","author":"Pugsley S. H.","year":"2014","unstructured":"S. H. Pugsley , \" NDC: analyzing the impact of 3d-stacked memory+logic devices on mapreduce workloads,\" in ISPASS , 2014 . S. H. Pugsley et al., \"NDC: analyzing the impact of 3d-stacked memory+logic devices on mapreduce workloads,\" in ISPASS, 2014."},{"key":"e_1_3_2_1_39_1","unstructured":"\"Intel integrated performance primitives.\" https:\/\/software.intel.com\/en-us\/intel-ipp.  \"Intel integrated performance primitives.\" https:\/\/software.intel.com\/en-us\/intel-ipp."},{"key":"e_1_3_2_1_40_1","unstructured":"\"GotoBLAS2.\" https:\/\/www.tacc.utexas.edu\/research-development\/tacc-software\/gotoblas2.  \"GotoBLAS2.\" https:\/\/www.tacc.utexas.edu\/research-development\/tacc-software\/gotoblas2."},{"key":"e_1_3_2_1_41_1","volume-title":"Optimizing sparse matrix computations for register reuse in sparsity,\" in Computational Science","author":"Im E.-J.","year":"2001","unstructured":"E.-J. Im and K. Yelick , \" Optimizing sparse matrix computations for register reuse in sparsity,\" in Computational Science , 2001 . E.-J. Im and K. Yelick, \"Optimizing sparse matrix computations for register reuse in sparsity,\" in Computational Science, 2001."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2009.207"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2688500.2688538"},{"key":"e_1_3_2_1_44_1","unstructured":"\"GPU-accelerated libraries.\" https:\/\/developer.nvidia.com\/gpu-accelerated-libraries.  \"GPU-accelerated libraries.\" https:\/\/developer.nvidia.com\/gpu-accelerated-libraries."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.48"},{"key":"e_1_3_2_1_46_1","volume-title":"General-purpose code acceleration with limited-precision analog computation,\" in ISCA","author":"St. Amant R.","year":"2014","unstructured":"R. St. Amant , \" General-purpose code acceleration with limited-precision analog computation,\" in ISCA , 2014 . R. St. Amant et al., \"General-purpose code acceleration with limited-precision analog computation,\" in ISCA, 2014."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485926"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540748"},{"key":"e_1_3_2_1_49_1","volume-title":"Race logic: A hardware acceleration for dynamic programming algorithms,\" in ISCA","author":"Madhavan A.","year":"2014","unstructured":"A. Madhavan , \" Race logic: A hardware acceleration for dynamic programming algorithms,\" in ISCA , 2014 . A. Madhavan et al., \"Race logic: A hardware acceleration for dynamic programming algorithms,\" in ISCA, 2014."},{"key":"e_1_3_2_1_50_1","unstructured":"\"OpenACC: Directives for accelerators.\" http:\/\/www.openacc-standard.org\/.  \"OpenACC: Directives for accelerators.\" http:\/\/www.openacc-standard.org\/."},{"key":"e_1_3_2_1_51_1","unstructured":"\"CUDA C programming guide.\" http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide.  \"CUDA C programming guide.\" http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide."},{"key":"e_1_3_2_1_52_1","unstructured":"\"OpenCL: the open standard for paralle programming of heterogeneous systems.\" https:\/\/www.khronos.org\/opencl\/.  \"OpenCL: the open standard for paralle programming of heterogeneous systems.\" https:\/\/www.khronos.org\/opencl\/."},{"key":"e_1_3_2_1_53_1","unstructured":"\"Nvidia's next generation cuda compute architecture: Fermi.\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf.  \"Nvidia's next generation cuda compute architecture: Fermi.\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf."},{"key":"e_1_3_2_1_54_1","volume-title":"Supporting x86-64 address translation for 100s of gpu lanes,\" in HPCA","author":"Power J.","year":"2014","unstructured":"J. Power , \" Supporting x86-64 address translation for 100s of gpu lanes,\" in HPCA , 2014 . J. Power et al., \"Supporting x86-64 address translation for 100s of gpu lanes,\" in HPCA, 2014."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541942"},{"key":"e_1_3_2_1_56_1","volume-title":"Comparing implementations of near-data computing with in-memory mapreduce workloads,\" IEEE Micro","author":"Pugsley S.","year":"2014","unstructured":"S. Pugsley , \" Comparing implementations of near-data computing with in-memory mapreduce workloads,\" IEEE Micro , 2014 . S. Pugsley et al., \"Comparing implementations of near-data computing with in-memory mapreduce workloads,\" IEEE Micro, 2014."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600212.2600213"},{"key":"e_1_3_2_1_58_1","volume-title":"3D-stacked memory-side acceleration: Accelerator and system design,\" in WoNDP","author":"Guo Q.","year":"2014","unstructured":"Q. Guo , \" 3D-stacked memory-side acceleration: Accelerator and system design,\" in WoNDP , 2014 . Q. Guo et al., \"3D-stacked memory-side acceleration: Accelerator and system design,\" in WoNDP, 2014."},{"key":"e_1_3_2_1_59_1","volume-title":"High performance AXI-4.0 based interconnect for extensible smart memory cubes,\" in DATE","author":"Azarkhish E.","year":"2015","unstructured":"E. Azarkhish , \" High performance AXI-4.0 based interconnect for extensible smart memory cubes,\" in DATE , 2015 . E. Azarkhish et al., \"High performance AXI-4.0 based interconnect for extensible smart memory cubes,\" in DATE, 2015."}],"event":{"name":"MICRO-48: The 48th Annual IEEE\/ACM International Symposium of Microarchitecture","sponsor":["IEEE Computer Society TC-uARCH","SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"],"location":"Waikiki Hawaii","acronym":"MICRO-48"},"container-title":["Proceedings of the 48th International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2830772.2830788","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2830772.2830788","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T05:48:39Z","timestamp":1750225719000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2830772.2830788"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,12,5]]},"references-count":58,"alternative-id":["10.1145\/2830772.2830788","10.1145\/2830772"],"URL":"https:\/\/doi.org\/10.1145\/2830772.2830788","relation":{},"subject":[],"published":{"date-parts":[[2015,12,5]]},"assertion":[{"value":"2015-12-05","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}