{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:38:55Z","timestamp":1740123535350,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2019,11,16]],"date-time":"2019-11-16T00:00:00Z","timestamp":1573862400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,11,16]],"date-time":"2019-11-16T00:00:00Z","timestamp":1573862400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572508","61872374"],"award-info":[{"award-number":["61572508","61872374"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61433019"],"award-info":[{"award-number":["61433019"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2020,4]]},"DOI":"10.1007\/s11227-019-03079-y","type":"journal-article","created":{"date-parts":[[2019,11,16]],"date-time":"2019-11-16T15:01:26Z","timestamp":1573916486000},"page":"2958-2985","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["A quantitative evaluation of unified memory in GPUs"],"prefix":"10.1007","volume":"76","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8105-8682","authenticated-orcid":false,"given":"Qi","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bruce","family":"Childers","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Libo","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Qian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiying","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,11,16]]},"reference":[{"key":"3079_CR1","doi-asserted-by":"crossref","unstructured":"Ausavarungnirun R, Landgraf J, Miller V, Ghose S, Gandhi J, Rossbach CJ, Mutlu O (2017) Mosaic: a GPU memory manager with application-transparent support for multiple page sizes. In: Proceedings of the 50th IEEE\/ACM International Symposium on Microarchitecture, pp 136\u2013150","DOI":"10.1145\/3123939.3123975"},{"key":"3079_CR2","doi-asserted-by":"crossref","unstructured":"Ausavarungnirun R, Miller V, Landgraf J, Ghose S, Gandhi J, Jog A, Rossbach CJ, Mutlu O (2018) MASK: redesigning the GPU memory hierarchy to support multi-application concurrency. In: Proceedings of the 23rd ACM International Conference on Architectural Support for Programming Languages and Operating Systems, pp 503\u2013518","DOI":"10.1145\/3173162.3173169"},{"key":"3079_CR3","doi-asserted-by":"crossref","unstructured":"Bakhoda A, Yuan GL, Fung WWL, Wong H, Aamodt TM (2009) Analyzing CUDA workloads using a detailed GPU simulator. In: Proceedings of 2009 IEEE International Symposium on Performance Analysis of Systems and Software, pp 163\u2013174","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"3079_CR4","unstructured":"Bansal S, Modha DS (2004) CAR: clock with adaptive replacement. In: Proceedings of the 3rd USENIX Conference on File and Storage Technologies, pp 187\u2013200"},{"key":"3079_CR5","doi-asserted-by":"crossref","unstructured":"Che S, Boyer M, Meng J, Tarjan D, Sheaffer JW, Lee S, Skadron K (2009) Rodinia: a benchmark suite for heterogeneous computing. In: Proceedings of 2009 IEEE International Symposium on Workload Characterization, pp 44\u201354","DOI":"10.1109\/IISWC.2009.5306797"},{"issue":"2","key":"3079_CR6","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1109\/MM.2018.022071134","volume":"38","author":"J Choquette","year":"2018","unstructured":"Choquette J, Giroux O, Foley D (2018) Volta: performance and programmability. IEEE Micro 38(2):42\u201352","journal-title":"IEEE Micro"},{"key":"3079_CR7","doi-asserted-by":"crossref","unstructured":"Danskin J (2016) PASCAL GPU WITH NVLINK. http:\/\/hotchips.org\/wp-content\/uploads\/hc_archives\/hc28\/HC28.22-Monday-Epub\/HC28.22.10-GPU-HPC-Epub\/HC28.22.121-Pascal-GPU-DanskinFoley-NVIDIA-v06-6_7.pdf. Accessed 5 May 2019","DOI":"10.1109\/HOTCHIPS.2016.7936202"},{"key":"3079_CR8","doi-asserted-by":"crossref","unstructured":"Dashti M, Fedorova A (2017) Analyzing memory management methods on integrated CPU-GPU systems. In: Proceedings of the 2017 ACM SIGPLAN International Symposium on Memory Management, pp 59\u201369","DOI":"10.1145\/3092255.3092256"},{"issue":"2","key":"3079_CR9","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1109\/MM.2017.37","volume":"37","author":"D Foley","year":"2017","unstructured":"Foley D, Danskin J (2017) Ultra-performance pascal GPU and NVLink interconnect. IEEE Micro 37(2):7\u201317","journal-title":"IEEE Micro"},{"key":"3079_CR10","doi-asserted-by":"crossref","unstructured":"Fung WW, Sham I, Yuan G, Aamodt TM (2007) Dynamic warp formation and scheduling for efficient GPU control flow. In: Proceedings of the 40th Annual IEEE\/ACM International Symposium on Microarchitecture, pp 407\u2013420","DOI":"10.1109\/MICRO.2007.30"},{"key":"3079_CR11","doi-asserted-by":"crossref","unstructured":"Ganguly D, Zhang Z, Yang J, Melhem R (2019) Interplay between hardware prefetcher and page eviction policy in CPU-GPU unified virtual memory. In: ISCA, pp 224\u2013235","DOI":"10.1145\/3307650.3322224"},{"key":"3079_CR12","doi-asserted-by":"crossref","unstructured":"Grauer-Gray S, Xu L, Searles R, Ayalasomayajula S, Cavazos J (2012) Auto-tuning a high-level language targeted to GPU codes. In: Proceedings of 2012 Innovative Parallel Computing, pp 1\u201310","DOI":"10.1109\/InPar.2012.6339595"},{"key":"3079_CR13","doi-asserted-by":"crossref","unstructured":"Hao Y, Fang Z, Reinman G, Cong J (2017) Supporting address translation for accelerator-centric architectures. In: Proceedings of the 23rd IEEE International Symposium on High Performance Computer Architecture, pp 37\u201348","DOI":"10.1109\/HPCA.2017.19"},{"key":"3079_CR14","unstructured":"Harris M (2013) Unified memory in CUDA 6. https:\/\/devblogs.nvidia.com\/unified-memory-in-cuda-6\/. Accessed 8 May 2019"},{"key":"3079_CR15","doi-asserted-by":"crossref","unstructured":"Hestness J, Keckler SW, Wood DA (2014) A comparative analysis of microarchitecture effects on CPU and GPU memory system behavior. In: Proceedings of 2014 IEEE International Symposium on Workload Characterization, pp 150\u2013160","DOI":"10.1109\/IISWC.2014.6983054"},{"issue":"2","key":"3079_CR16","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1145\/3224430","volume":"2","author":"A Jain","year":"2018","unstructured":"Jain A, Khairy M, Rogers TG (2018) A quantitative evaluation of contemporary gpu simulation methodology. Proc ACM Meas Anal Comput Syst 2(2):35","journal-title":"Proc ACM Meas Anal Comput Syst"},{"key":"3079_CR17","doi-asserted-by":"crossref","unstructured":"Jaleel A, Theobald KB, Steely\u00a0Jr SC, Emer J (2010) High performance cache replacement using re-reference interval prediction (RRIP). In: Proceedings of the 37th International Symposium on Computer Architecture, pp 60\u201371","DOI":"10.1145\/1816038.1815971"},{"issue":"12","key":"3079_CR18","doi-asserted-by":"publisher","first-page":"5378","DOI":"10.1007\/s11227-017-2091-x","volume":"73","author":"\u0141 Jarz\u0105bek","year":"2017","unstructured":"Jarz\u0105bek \u0141, Czarnul P (2017) Performance evaluation of unified memory and dynamic parallelism for selected parallel cuda applications. J Supercomput 73(12):5378\u20135401","journal-title":"J Supercomput"},{"key":"3079_CR19","unstructured":"Jiang S, Chen F, Zhang X (2005) CLOCK-Pro: an effective improvement of the CLOCK replacement. In: Proceedings of USENIX Annual Technical Conference, pp 323\u2013336"},{"key":"3079_CR20","doi-asserted-by":"crossref","unstructured":"Jog A, Kayiran O, Mishra AK, Kandemir MT, Mutlu O, Iyer R, Das CR (2013) Orchestrated scheduling and prefetching for GPGPUs. In: Proceedings of the 40th Annual International Symposium on Computer Architecture, pp 332\u2013343","DOI":"10.1145\/2485922.2485951"},{"key":"3079_CR21","doi-asserted-by":"crossref","unstructured":"Kehne J, Metter J, Bellosa F (2015) GPUswap: enabling oversubscription of GPU memory through transparent swapping. In: Proceedings of the 11th ACM SIGPLAN\/SIGOPS International Conference on Virtual Execution Environments, pp 65\u201377","DOI":"10.1145\/2731186.2731192"},{"key":"3079_CR22","unstructured":"Xu JY (2008) OpenCL \u2013 the open standard for parallel programming of heterogeneous systems. https:\/\/pdfs.semanticscholar.org\/fb16\/3d7fe546bb950294ffaf5ef6e225f630c76d.pdf. Accessed 14 Nov 2019"},{"key":"3079_CR23","doi-asserted-by":"crossref","unstructured":"Landaverde R, Zhang T, Coskun AK, Herbordt M (2014) An investigation of unified memory access performance in CUDA. In: Proceedings of 2014 IEEE High Performance Extreme Computing Conference, pp 1\u20136","DOI":"10.1109\/HPEC.2014.7040988"},{"key":"3079_CR24","doi-asserted-by":"crossref","unstructured":"Li C, Ausavarungnirun R, Rossbach CJ, Zhang Y, Mutlu O, Guo Y, Yang J (2019) A framework for memory oversubscription management in graphics processing units. In: Proceedings of the 24th ACM International Conference on Architectural Support for Programming Languages and Operating System","DOI":"10.1145\/3297858.3304044"},{"key":"3079_CR25","doi-asserted-by":"crossref","unstructured":"Li W, Jin G, Cui X, See S (2015) An evaluation of unified memory technology on NVIDIA GPUs. In: Proceedings of the 15th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing, pp 1092\u20131098","DOI":"10.1109\/CCGrid.2015.105"},{"issue":"2","key":"3079_CR26","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/MM.2008.31","volume":"28","author":"E Lindholm","year":"2008","unstructured":"Lindholm E, Nickolls J, Oberman S, Montrym J (2008) NVIDIA tesla: a unified graphics and computing architecture. IEEE Micro 28(2):39\u201355","journal-title":"IEEE Micro"},{"key":"3079_CR27","unstructured":"NVIDIA (2009) NVIDIA next generation CUDA compute architecture: Fermi. https:\/\/www.nvidia.com\/content\/PDF\/fermi_white_papers\/NVIDIA_Fermi_Compute_Architecture_Whitepaper.pdf. Accessed 10 May 2019"},{"key":"3079_CR28","unstructured":"NVIDIA (2018) CUDA C programming guide. https:\/\/docs.nvidia.com\/cuda\/archive\/9.1\/pdf\/CUDA_C_Programming_Guide.pdf. Accessed 14 Nov 2019"},{"key":"3079_CR29","unstructured":"NVIDIA (2016) Pascal P100. https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf. Accessed 10 May 2019"},{"key":"3079_CR30","unstructured":"NVIDIA (2017) TESLA V100 GPU ARCHITECTURE. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf. Accessed 10 May 2019"},{"key":"3079_CR31","doi-asserted-by":"crossref","unstructured":"Pichai B, Hsu L, Bhattacharjee A (2014) Architectural support for address translation on GPUs: designing memory management units for CPU\/GPUs with unified address spaces. In: Proceedings of the 19th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, pp 743\u2013758","DOI":"10.1145\/2541940.2541942"},{"key":"3079_CR32","doi-asserted-by":"crossref","unstructured":"Power J, Hill MD, Wood DA (2014) Supporting x86-64 address translation for 100s of GPU lanes. In: Proceedings of the 20th IEEE International Symposium on High Performance Computer Architecture, pp 568\u2013578","DOI":"10.1109\/HPCA.2014.6835965"},{"key":"3079_CR33","doi-asserted-by":"crossref","unstructured":"Qureshi MK, Jaleel A, Patt YN, Steely SC, Emer J (2007) Adaptive insertion policies for high performance caching. In: Proceedings of the 34th International Symposium on Computer Architecture, pp 381\u2013391","DOI":"10.1145\/1273440.1250709"},{"key":"3079_CR34","doi-asserted-by":"crossref","unstructured":"Rogers TG, O\u2019Connor M, Aamodt TM (2012) Cache-conscious wavefront scheduling. In: Proceedings of the 2012 45th Annual IEEE\/ACM International Symposium on Microarchitecture, pp 72\u201383","DOI":"10.1109\/MICRO.2012.16"},{"key":"3079_CR35","unstructured":"Sakharnykh N (2016) Beyond GPU memory limits with unified memory on pascal. https:\/\/devblogs.nvidia.com\/beyond-gpu-memory-limits-unified-memory-pascal\/. Accessed 11 May 2019"},{"key":"3079_CR36","unstructured":"Sakharnykh N (2017) Unified memory on pascal and volta. http:\/\/on-demand.gputechconf.com\/gtc\/2017\/presentation\/s7285-nikolay-sakharnykh-unified-memory-on-pascal-and-volta.pdf. Accessed 11 May 2019"},{"key":"3079_CR37","unstructured":"Sakharnykh N (2018) Everything you need to know about unified memory. http:\/\/on-demand.gputechconf.com\/gtc\/2018\/presentation\/s8430-everything-you-need-to-know-about-unified-memory.pdf. Accessed 11 May 2019"},{"key":"3079_CR38","doi-asserted-by":"crossref","unstructured":"Shin S, Cox G, Oskin M, Loh GH, Solihin Y, Bhattacharjee A, Basu A (2018) Scheduling page table walks for irregular GPU applications. In: Proceedings of the 45th International Symposium on Computer Architecture, pp 180\u2013192","DOI":"10.1109\/ISCA.2018.00025"},{"key":"3079_CR39","doi-asserted-by":"crossref","unstructured":"Shin S, LeBeane M, Solihin Y, Basu A (2018) Neighborhood-aware address translation for irregular GPU applications. In: Proceedings of the 51st IEEE\/ACM International Symposium on Microarchitecture, pp 352\u2013363","DOI":"10.1109\/MICRO.2018.00036"},{"key":"3079_CR40","unstructured":"Stratton JA, Rodrigues C, Sung I, Obeid N, Chang L, Anssari N, Liu GD, Hwu WW (2012) Parboil: a revised benchmark suite for scientific and commercial throughput computing. IMPACT Technical Report, pp 1\u201312"},{"key":"3079_CR41","doi-asserted-by":"crossref","unstructured":"Vesely J, Basu A, Oskin M, Loh GH, Bhattacharjee A (2016) Observations and opportunities in architecting shared virtual memory for heterogeneous systems. In: Proceedings of 2016 IEEE International Symposium on Performance Analysis of Systems and Software, pp 161\u2013171","DOI":"10.1109\/ISPASS.2016.7482091"},{"key":"3079_CR42","doi-asserted-by":"publisher","unstructured":"Yu Q, Childers B, Huang L, Qian C, Wang Z. HPE: Hierarchical page eviction policy for unified memory in GPUs. IEEE Trans Comput-Aided Des Integr Circuits Syst. https:\/\/doi.org\/10.1109\/TCAD.2019.2944790","DOI":"10.1109\/TCAD.2019.2944790"},{"key":"3079_CR43","doi-asserted-by":"crossref","unstructured":"Yu Q, Childers B, Huang L, Qian C, Wang Z (2019) Hierarchical page eviction policy for unified memory in GPUs. In: 2019 IEEE International Symposium on Performance Analysis of Systems and Software, pp 149\u2013150","DOI":"10.1109\/ISPASS.2019.00027"},{"key":"3079_CR44","doi-asserted-by":"crossref","unstructured":"Zheng T, Nellans D, Zulfiqar A, Stephenson M, Keckler SW (2016) Towards high performance paged memory for GPUs. In: Proceedings of the 22nd IEEE International Symposium on High Performance Computer Architecture, pp 345\u2013357","DOI":"10.1109\/HPCA.2016.7446077"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-019-03079-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-019-03079-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-019-03079-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,11,15]],"date-time":"2020-11-15T00:41:43Z","timestamp":1605400903000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-019-03079-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,11,16]]},"references-count":44,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,4]]}},"alternative-id":["3079"],"URL":"https:\/\/doi.org\/10.1007\/s11227-019-03079-y","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2019,11,16]]},"assertion":[{"value":"16 November 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}