{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T15:25:10Z","timestamp":1759937110968,"version":"3.37.3"},"reference-count":23,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2018,1,27]],"date-time":"2018-01-27T00:00:00Z","timestamp":1517011200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100005856","name":"Faculdade de Ci\u00eancias e Tecnologia, Universidade Nova de Lisboa","doi-asserted-by":"publisher","award":["UID\/CEC\/50021\/2013","SFRH\/BD\/100697\/2014"],"award-info":[{"award-number":["UID\/CEC\/50021\/2013","SFRH\/BD\/100697\/2014"]}],"id":[{"id":"10.13039\/501100005856","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2018,6]]},"DOI":"10.1007\/s11227-018-2260-6","type":"journal-article","created":{"date-parts":[[2018,1,27]],"date-time":"2018-01-27T10:09:00Z","timestamp":1517047740000},"page":"2314-2328","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Stream data prefetcher for the GPU memory interface"],"prefix":"10.1007","volume":"74","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0628-2259","authenticated-orcid":false,"given":"Nuno","family":"Neves","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8083-4432","authenticated-orcid":false,"given":"Pedro","family":"Tom\u00e1s","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2491-4977","authenticated-orcid":false,"given":"Nuno","family":"Roma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,1,27]]},"reference":[{"key":"2260_CR1","doi-asserted-by":"crossref","unstructured":"Amilkanthwar M, Balachandran S (2013) CUPL: A compile-time uncoalesced memory access pattern locator for CUDA. In: Proceedings of the 27th ACM International Conference On Supercomputing. ACM, pp 459\u2013460","DOI":"10.1145\/2464996.2467288"},{"issue":"3","key":"2260_CR2","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/2366231.2337169","volume":"40","author":"JM Arnau","year":"2012","unstructured":"Arnau JM, Parcerisa JM, Xekalakis P (2012) Boosting mobile GPU performance with a decoupled access\/execute fragment processor. ACM SIGARCH Comput Archit News 40(3):84\u201393","journal-title":"ACM SIGARCH Comput Archit News"},{"key":"2260_CR3","doi-asserted-by":"crossref","unstructured":"Bakhoda A, Yuan GL, Fung WW, Wong H, Aamodt TM (2009) Analyzing CUDA workloads using a detailed GPU simulator. In: IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp 163\u2013174","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"2260_CR4","doi-asserted-by":"crossref","unstructured":"Che S, Boyer M, Meng J, Tarjan D, Sheaffer JW, Lee SH, Skadron K (2009) Rodinia: a benchmark suite for heterogeneous computing. In: IEEE International Symposium on Workload Characterization (IISWC), pp 44\u201354","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"2260_CR5","doi-asserted-by":"crossref","unstructured":"Ghosh S, Martonosi M, Malik S (1997) Cache miss equations: An analytical representation of cache misses. In: ACM International Conference on Supercomputing. ACM Press, pp 317\u2013324","DOI":"10.1145\/263580.263657"},{"key":"2260_CR6","doi-asserted-by":"crossref","unstructured":"Grauer-Gray S, Xu L, Searles R, Ayalasomayajula S, Cavazos J (2012) Auto-tuning a high-level language targeted to GPU codes. In: Innovative Parallel Computing (InPar), 2012. IEEE, pp 1\u201310","DOI":"10.1109\/InPar.2012.6339595"},{"issue":"04","key":"2260_CR7","doi-asserted-by":"publisher","first-page":"1250010","DOI":"10.1142\/S0129626412500107","volume":"22","author":"T Grosser","year":"2012","unstructured":"Grosser T, Groesslinger A, Lengauer C (2012) Polly\u2014performing polyhedral optimizations on a low-level intermediate representation. Parallel Process Lett 22(04):1250010","journal-title":"Parallel Process Lett"},{"key":"2260_CR8","doi-asserted-by":"crossref","unstructured":"Jia W, Shaw K, Martonosi M (2014) MRPB: Memory request prioritization for massively parallel processors. In: 2014 IEEE 20th International Symposium on High Performance Computer Architecture (HPCA). IEEE, pp 272\u2013283","DOI":"10.1109\/HPCA.2014.6835938"},{"key":"2260_CR9","doi-asserted-by":"crossref","unstructured":"Jia W, Shaw KA, Martonosi M (2012) Characterizing and improving the use of demand-fetched caches in GPUs. In: Proceedings of the 26th ACM International Conference on Supercomputing. ACM, pp 15\u201324","DOI":"10.1145\/2304576.2304582"},{"issue":"3","key":"2260_CR10","doi-asserted-by":"publisher","first-page":"332","DOI":"10.1145\/2508148.2485951","volume":"41","author":"A Jog","year":"2013","unstructured":"Jog A, Kayiran O, Mishra AK, Kandemir MT, Mutlu O, Iyer R, Das CR (2013) Orchestrated scheduling and prefetching for GPGPUs. ACM SIGARCH Comput Archit News 41(3):332\u2013343","journal-title":"ACM SIGARCH Comput Archit News"},{"key":"2260_CR11","doi-asserted-by":"crossref","unstructured":"Lakshminarayana NB, Kim H (2014) Spare register aware prefetching for graph algorithms on gpus. In: IEEE 20th International Symposium on High Performance Computer Architecture (HPCA), pp 614\u2013625","DOI":"10.1109\/HPCA.2014.6835970"},{"key":"2260_CR12","doi-asserted-by":"crossref","unstructured":"Lee J, Lakshminarayana NB, Kim H, Vuduc R (2010) Many-thread aware prefetching mechanisms for GPGPU applications. In: 43rd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO), pp 213\u2013224","DOI":"10.1109\/MICRO.2010.44"},{"key":"2260_CR13","doi-asserted-by":"crossref","unstructured":"Lee S, Kim K, Koo G, Jeon H, Ro WW, Annavaram M (2015) Warped-compression: enabling power efficient GPUs through register compression. In: 42nd Intl Symposium on Computer Architecture. ACM, pp 502\u2013514","DOI":"10.1145\/2749469.2750417"},{"issue":"3","key":"2260_CR14","doi-asserted-by":"publisher","first-page":"487","DOI":"10.1145\/2508148.2485964","volume":"41","author":"J Leng","year":"2013","unstructured":"Leng J, Hetherington T, ElTantawy A, Gilani S, Kim NS, Aamodt TM, Reddi VJ (2013) GPUWattch: enabling energy optimizations in GPGPUs. ACM SIGARCH Comput Archit News 41(3):487\u2013498","journal-title":"ACM SIGARCH Comput Archit News"},{"issue":"7","key":"2260_CR15","doi-asserted-by":"publisher","first-page":"2130","DOI":"10.1109\/TVLSI.2017.2671405","volume":"25","author":"N Neves","year":"2017","unstructured":"Neves N, Tom\u00e1s P, Roma N (2017) Adaptive in-cache streaming for efficient data management. IEEE Trans Very Large Scale Integr (VLSI) Syst 25(7):2130\u20132143","journal-title":"IEEE Trans Very Large Scale Integr (VLSI) Syst"},{"key":"2260_CR16","unstructured":"NVIDIA (2009) NVIDIA\u2019s Next Generation CUDATM Compute Architecture: FermiTM. NVIDIA, Santa Clara, Calif, USA"},{"key":"2260_CR17","unstructured":"NVIDIA (2016) NVIDIA GP100 Pascal Architecture. White paper (Online). \n                    https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf"},{"key":"2260_CR18","doi-asserted-by":"crossref","unstructured":"Panda R, Eckert Y, Jayasena N, Kayiran O, Boyer M, John LK (2016) Prefetching techniques for near-memory throughput processors. In: Proceedings of the 2016 International Conference on Supercomputing, ICS \u201916. ACM, New York, pp. 40:1\u201340:14","DOI":"10.1145\/2925426.2926282"},{"key":"2260_CR19","unstructured":"Sethia A, Dasika G, Samadi M, Mahlke S (2013) APOGEE: Adaptive prefetching on GPUs for energy efficiency. In: Proceedings of the 22nd International Conference on Parallel Architectures and Compilation Techniques. IEEE, pp 73\u201382"},{"key":"2260_CR20","doi-asserted-by":"crossref","unstructured":"Stephenson M, Hari SKS, Lee Y, Ebrahimi E, Johnson DR, Nellans D, O\u2019Connor M, Keckler SW (2015) Flexible software profiling of GPU architectures. In: 42nd International Symposium on Computer Architecture. ACM, pp 185\u2013197","DOI":"10.1145\/2749469.2750375"},{"key":"2260_CR21","doi-asserted-by":"crossref","unstructured":"Torres Y, Gonzalez-Escribano A, Llanos DR (2011) Understanding the impact of CUDA tuning techniques for Fermi. In: International Conference on High Performance Computing and Simulation (HPCS). IEEE, pp 631\u2013639","DOI":"10.1109\/HPCSim.2011.5999886"},{"issue":"8","key":"2260_CR22","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1145\/2517327.2442523","volume":"48","author":"B Wu","year":"2013","unstructured":"Wu B, Zhao Z, Zhang EZ, Jiang Y, Shen X (2013) Complexity analysis and algorithm design for reorganizing data to minimize non-coalesced memory accesses on GPU. ACM SIGPLAN Not 48(8):57\u201368","journal-title":"ACM SIGPLAN Not"},{"key":"2260_CR23","doi-asserted-by":"crossref","unstructured":"Xie X, Liang Y, Wang Y, Sun G, Wang T (2015) Coordinated static and dynamic cache bypassing for GPUs. In: 2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA). IEEE, pp 76\u201388","DOI":"10.1109\/HPCA.2015.7056023"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-018-2260-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-018-2260-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-018-2260-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,1,26]],"date-time":"2019-01-26T20:01:29Z","timestamp":1548532889000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-018-2260-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,1,27]]},"references-count":23,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2018,6]]}},"alternative-id":["2260"],"URL":"https:\/\/doi.org\/10.1007\/s11227-018-2260-6","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2018,1,27]]},"assertion":[{"value":"27 January 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}