{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T16:48:25Z","timestamp":1759942105292},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2014,12,5]],"date-time":"2014-12-05T00:00:00Z","timestamp":1417737600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2015,8]]},"DOI":"10.1007\/s11227-014-1331-6","type":"journal-article","created":{"date-parts":[[2014,12,4]],"date-time":"2014-12-04T05:52:26Z","timestamp":1417672346000},"page":"2808-2829","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Power-efficient prefetching on GPGPUs"],"prefix":"10.1007","volume":"71","author":[{"given":"Hajar","family":"Falahati","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shaahin","family":"Hessabi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mania","family":"Abdi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Amirali","family":"Baniasadi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2014,12,5]]},"reference":[{"key":"1331_CR1","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4419-0263-4","volume-title":"Multicore processors and systems","author":"SW Keckler","year":"2009","unstructured":"Keckler SW, Olukotun L, Hofstee HP (2009) Multicore processors and systems. Springer, New York"},{"key":"1331_CR2","unstructured":"ITRS (2008) Update. http:\/\/www.itrs.net\/Links\/2008ITRS\/Home2008.htm"},{"key":"1331_CR3","doi-asserted-by":"crossref","unstructured":"Agarwal V, Hrishikesh MS, Keckler SW, Burger D (2000) Clock rate versus IPC: the end of the road for conventional microarchitectures. In: Proceedings of the 27th annual international symposium on computer architecture (ISCA \u201900), pp 248\u2013259","DOI":"10.1145\/339647.339691"},{"key":"1331_CR4","doi-asserted-by":"crossref","unstructured":"Amodt TM (2009) Architecting graphics processors for non-graphics compute acceleration. In: IEEE Pacific Rim conference on communications, computers and signal processing, Victoria, BC, 23\u201326 August 2009, pp 963\u2013968","DOI":"10.1109\/PACRIM.2009.5291239"},{"key":"1331_CR5","doi-asserted-by":"crossref","unstructured":"Owens JD, Houston M, Luebke D, Green S, Stone JE, Phillips JC (2008) GPU computing graphics: processing units-powerful, programmable, and highly parallel-are increasingly targeting general-purpose computing applications. Proc IEEE 96(5):879\u2013899","DOI":"10.1109\/JPROC.2008.917757"},{"key":"1331_CR6","unstructured":"Owens JD, Luebke D, Govindaraju N, Harris M, Kr\u00fcger J, Lefohn AE, Purcell TJ (2005) A survey of general-purpose computation on graphics hardware. In: Proceedings of EUROGRAPHICS 2005, pp 21\u201351"},{"key":"1331_CR7","unstructured":"NVIDIA. http:\/\/www.nvidia.com\/object\/what-is-gpu-computing.html"},{"key":"1331_CR8","doi-asserted-by":"crossref","unstructured":"Hong S, Kim H (2009) An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness. In: Proceedings of the 36th annual international symposium on computer architecture (ISCA \u201909), pp 152\u2013163","DOI":"10.1145\/1555754.1555775"},{"key":"1331_CR9","doi-asserted-by":"crossref","unstructured":"Gou C, Gaydadjiev GN (2011) Elastic pipeline: addressing GPU on-chip shared memory bank conflicts. In: Proceedings of the 8th ACM international conference on computing frontiers (CF \u201911)","DOI":"10.1145\/2016604.2016608"},{"key":"1331_CR10","doi-asserted-by":"crossref","unstructured":"Bakhoda A, Yuan G, Fung W, Wong H, Aamodt T (2009) Analyzing CUDA workloads using a detailed GPU simulator. In: IEEE international symposium on performance analysis of systems and software, ISPASS 2009, Boston, MA, 26\u201328 April 2009, pp 163\u2013174","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"1331_CR11","doi-asserted-by":"crossref","unstructured":"Hong S, Kim H (2010) An integrated GPU power and performance model. In: Proceedings of the 37th annual international symposium on computer architecture (ISCA \u201910), 280\u2013289","DOI":"10.1145\/1815961.1815998"},{"key":"1331_CR12","doi-asserted-by":"crossref","unstructured":"Tarjan D, Skadron K (2010) The sharing tracker: using ideas from cache coherence hardware to reduce off-chip memory traffic with non-coherent caches. In: International conference for high performance computing, networking, storage and analysis (SC), New Orleans, LA, 13\u201319 November 2010, pp 1\u201310","DOI":"10.1109\/SC.2010.54"},{"key":"1331_CR13","doi-asserted-by":"crossref","unstructured":"Scogland TRW, Lin H, Feng W (2010) A first look at integrated GPUs for green high-performance computing. Comput Sci Res Dev 25:125\u2013134","DOI":"10.1007\/s00450-010-0128-y"},{"key":"1331_CR14","doi-asserted-by":"crossref","unstructured":"Wang PH, Chen YM, Yang CL, Cheng YJ (2009) A predictive shutdown technique for GPU shader processors. IEEE Comput Archit Lett 8(1):9\u201312","DOI":"10.1109\/L-CA.2009.1"},{"key":"1331_CR15","doi-asserted-by":"crossref","unstructured":"Gebhart M, Keckler SW, Khailany B, Krashinsky R, Dally WJ (2012) Unifying primary cache, scratch, and register file memories in a throughput processor. In: MICRO-45 proceedings of the 2012 45th annual IEEE\/ACM international symposium on microarchitecture, pp 96\u2013106","DOI":"10.1109\/MICRO.2012.18"},{"key":"1331_CR16","doi-asserted-by":"crossref","unstructured":"Lindholm E et al. (2008) NVIDIA tesla: a unified graphics and computing architecture. IEEE Micro 28(2):39\u201355","DOI":"10.1109\/MM.2008.31"},{"key":"1331_CR17","unstructured":"NVIDIA Crop. CUDA C programming guide. http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/"},{"key":"1331_CR18","doi-asserted-by":"crossref","unstructured":"Falahati H, Abdi M, Baniasadi A, Hessabi S (2013) ISP: using idle SMs in hardware-based prefetching. In: 17th CSI international symposium on computer architecture and digital systems (CADS), 2013, Tehran, 30\u201331 October 2013, pp 3\u20138","DOI":"10.1109\/CADS.2013.6714230"},{"key":"1331_CR19","unstructured":"NVIDIA\u2019s next generation CUDA compute architecture: Fermi. http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf"},{"key":"1331_CR20","unstructured":"AMS\u2019s Radeon. http:\/\/developer.amd.com\/resources\/documentation-articles\/gpu-demos\/radeon-hd-6900-series-graphics-real-time-demo\/"},{"key":"1331_CR21","unstructured":"NVIDIAs. http:\/\/developer.nvidia.com\/nvidia-gpu-computing-documentation"},{"key":"1331_CR22","unstructured":"AMD. Chu MM (2010) GPU Computing: past, present and future with ATI stream technology."},{"key":"1331_CR23","unstructured":"Hennessey J, Patterson D (2006) Computer architecture: a quantitative approach, 4th edn. Morgan Kaufmann. http:\/\/www.amazon.com\/Computer-Architecture-Quantitative-Approach-Edition\/dp\/0123704901"},{"key":"1331_CR24","doi-asserted-by":"crossref","unstructured":"Fung WL et al. (2007) Dynamic warp formation and scheduling for efficient GPU control flow. In: 40th annual IEEE\/ACM international symposium on microarchitecture, 2007 (MICRO 2007), Chicago, IL, 1\u20135 December 2007, pp 407\u2013420","DOI":"10.1109\/MICRO.2007.30"},{"key":"1331_CR25","doi-asserted-by":"crossref","unstructured":"Gebhart M, Johnson DR, Tarjan D, Keckler SW, Dally WJ, Lindholm E, Skadron K (2011) Energy-efficient mechanisms for managing thread context in throughput processors. In: Proceedings of the 38th annual international symposium on computer architecture (ISCA \u201911 ), pp 235\u2013246","DOI":"10.1145\/2000064.2000093"},{"key":"1331_CR26","doi-asserted-by":"crossref","unstructured":"Gilani SZ, Kim NS, Schulte MJ (2013) Exploiting GPU peak-power and performance tradeoffs through reduced effective pipeline latency. In: Proceedings of the 46th annual IEEE\/ACM international symposium on microarchitecture (MICRO-46), pp 74\u201385","DOI":"10.1145\/2540708.2540716"},{"key":"1331_CR27","doi-asserted-by":"crossref","unstructured":"Abdel-Majeed M, Wong D, Annavaram M (2013) Warped gates: gating aware scheduling and power gating for GPGPUs. In: Proceedings of the 46th annual IEEE\/ACM international symposium on microarchitecture (MICRO-46), pp 111\u2013122","DOI":"10.1145\/2540708.2540719"},{"key":"1331_CR28","doi-asserted-by":"crossref","unstructured":"Leng J, Hetherington T, Eitantawy A, Gilani S, Kim NS, Aamodt TM, Reddi VJ (2013) GPUWattch: enabling energy optimizations in GPGPUs. In: Proceedings of the 40th annual international symposium on computer architecture, pp 487\u2013498","DOI":"10.1145\/2485922.2485964"},{"key":"1331_CR29","doi-asserted-by":"crossref","unstructured":"Lucas J, Lal S, Andersch M, Mesa MA, Juurlink B (2013) How a single chip causes massive power bills GPUSimPow: a GPGPU power simulator. In: Proceedings of ISPASS, 2013","DOI":"10.1109\/ISPASS.2013.6557150"},{"key":"1331_CR30","doi-asserted-by":"crossref","unstructured":"Li S et al. (2009) McPAT: an integrated power, area, and timing modeling framework for multicore and manycore architectures. In: 42nd annual IEEE\/ACM international symposium on microarchitecture, 2009 (MICRO-42), New York, NY, 12\u201316 December 2009, pp 469\u2013480","DOI":"10.1145\/1669112.1669172"},{"key":"1331_CR31","doi-asserted-by":"crossref","unstructured":"Keramidas G, Spiliopoulos V, Kaxiras S (2010) Interval-based models for run-time DVFS orchestration in superscalar processors. In: Proceedings of the 7th ACM international conference on computing frontiers (CF \u201910), pp 287\u2013296","DOI":"10.1145\/1787275.1787338"},{"key":"1331_CR32","doi-asserted-by":"crossref","unstructured":"Eyerman S, Eeckhout L, Karkhanis T, Smith JE (2010) A mechanistic performance model for superscalar out-of-order processors. In: ACM Trans Comput Syst 27(2). doi: 10.1145\/1534909.1534910","DOI":"10.1145\/1534909.1534910"},{"key":"1331_CR33","unstructured":"Aamodt TM et al. (2012) GPGPU-Sim 3.x Manual. University of BritishColumbi. http:\/\/gpgpu-sim.org\/manual\/index.php\/GPGPU-Sim_3.x_Manual"},{"key":"1331_CR34","doi-asserted-by":"crossref","unstructured":"Che S et al. (2009) Rodinia: a benchmark suite for heterogeneous computing. In: IEEE international symposium on workload characterization, 2009 (IISWC 2009), Austin, TX, 4\u20136 October 2009, pp 44\u201354","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"1331_CR35","unstructured":"NVIDIA Corp. CUDA SDK 2.3. https:\/\/developer.nvidia.com\/cuda-toolkit-23-downloads"},{"key":"1331_CR36","unstructured":"NVIDIA Corp. CUDA SDK 3.1. https:\/\/developer.nvidia.com\/cuda-toolkit-31-downloads"},{"key":"1331_CR37","unstructured":"Rofouei M, Stathopoulos T, Ryffel S, Kaiser W, Sarrafzadeh M (2008) Energy-aware high performance computing with graphic processing units. In: Proceedings of the 2008 conference on power aware computing and systems (HotPower\u201908), pp 11\u201311"},{"key":"1331_CR38","doi-asserted-by":"crossref","unstructured":"Huang S, Xiao S, Feng W (2009) On the energy efficiency of graphics processing units for scientific computing. In: IEEE international symposium on parallel & distributed processing, 2009 (IPDPS 2009), Rome, 23\u201329 May 2009, pp 1\u20138","DOI":"10.1109\/IPDPS.2009.5160980"},{"key":"1331_CR39","doi-asserted-by":"crossref","unstructured":"Jiao Y, Lin H, Balaji P, Feng W (2010) Power and performance characterization of computational kernels on the GPU. In: IEEE\/ACM international conference on green computing and communications, 2010 (GreenCom\u201910) & international conference on cyber, physical and social computing (CPSCom), Hangzhou, 18\u201320 December 2010, pp 221\u2013228","DOI":"10.1109\/GreenCom-CPSCom.2010.143"},{"key":"1331_CR40","doi-asserted-by":"crossref","unstructured":"Byna S, Chen Y, Sun XH (2009) Taxonomy of data prefetching for multicore processors. J Comput Sci Technol 24(3): 405\u2013417. (Taxonomy of data prefetching for multicore processors).","DOI":"10.1007\/s11390-009-9233-4"},{"key":"1331_CR41","doi-asserted-by":"crossref","unstructured":"Woo DH, Lee HS (2010) COMPASS: a programmable data prefetcher using idle GPU shaders. In: Proceedings of the fifteenth edition of ASPLOS on architectural support for programming languages and operating systems (ASPLOS XV), pp 297\u2013310","DOI":"10.1145\/1736020.1736054"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-014-1331-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-014-1331-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-014-1331-6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T06:40:37Z","timestamp":1559371237000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-014-1331-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,12,5]]},"references-count":41,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2015,8]]}},"alternative-id":["1331"],"URL":"https:\/\/doi.org\/10.1007\/s11227-014-1331-6","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,12,5]]}}}