{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T10:39:17Z","timestamp":1756463957191,"version":"3.37.3"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2018,5,19]],"date-time":"2018-05-19T00:00:00Z","timestamp":1526688000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Key R&D Program of China","award":["2017YFB0203201","2017YFC0820100"],"award-info":[{"award-number":["2017YFB0203201","2017YFC0820100"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61732002","61202425"],"award-info":[{"award-number":["61732002","61202425"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2018,7]]},"DOI":"10.1007\/s11227-018-2389-3","type":"journal-article","created":{"date-parts":[[2018,5,19]],"date-time":"2018-05-19T01:43:27Z","timestamp":1526694207000},"page":"3388-3414","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["SRAM- and STT-RAM-based hybrid, shared last-level cache for on-chip CPU\u2013GPU heterogeneous architectures"],"prefix":"10.1007","volume":"74","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5637-9417","authenticated-orcid":false,"given":"Lan","family":"Gao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunlong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hailong","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongzhi","family":"Luan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Depei","family":"Qian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Han","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jihong","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,5,19]]},"reference":[{"unstructured":"Yuffe M, Knoll E, Mehalel M, Shor J, Kurts T (2011) A fully integrated multi-CPU, GPU and memory controller 32\u00a0nm processor. In: Proceedings of the International Solid-State Circuits Conference, pp 264\u2013266","key":"2389_CR1"},{"unstructured":"AMD (2017) AMD and HSA. \n                    http:\/\/www.amd.com\/en-us\/innovations\/software-technologies\/hsa\n                    \n                  . Accessed 25 Apr 2018","key":"2389_CR2"},{"unstructured":"Nvidia (2014) NVIDIA Jetson TK1 development kit: bringing GPU-accelerated computing to embedded systems. \n                    http:\/\/developer.download.nvidia.com\/embedded\/jetson\/TK1\/docs\/Jetson_platform_brief_May2014.pdf\n                    \n                  . Accessed 25 Apr 2018","key":"2389_CR3"},{"unstructured":"Nvidia (2015) NVIDIA Tegra X1: NVIDIA\u2019S new mobile superchip. \n                    https:\/\/international.download.nvidia.com\/pdf\/tegra\/Tegra-X1-whitepaper-v1.0.pdf\n                    \n                  . Accessed 25 Apr 2018","key":"2389_CR4"},{"unstructured":"Chang M, Rosenfeld P, Lu S, Jacob B (2013) Technology comparison for large last-level caches (L3Cs): low-leakage SRAM, low write-energy STT-RAM, and refresh-optimized eDRAM. In: Proceedings of the 19th International High Performance Computer Architecture Symposium, pp 143\u2013154","key":"2389_CR5"},{"issue":"7","key":"2389_CR6","doi-asserted-by":"publisher","first-page":"994","DOI":"10.1109\/TCAD.2012.2185930","volume":"31","author":"X Dong","year":"2012","unstructured":"Dong X, Xu C, Xie Y, Jouppi NP (2012) NVSim: a circuit-level performance, energy, and area model for emerging non-volatile memory. IEEE Trans Comput Aided Des Integr Circuits Syst 31(7):994\u20131007","journal-title":"IEEE Trans Comput Aided Des Integr Circuits Syst"},{"doi-asserted-by":"crossref","unstructured":"Jog A, Mishra AK, Xu C, Xie Y, Narayanan V, Iyer R, Das CR (2012) Cache revive: architecting volatile STT-RAM caches for enhanced performance in CMPs. In: Proceedings of the 49th Design Automation Conference, pp 243\u2013252","key":"2389_CR7","DOI":"10.1145\/2228360.2228406"},{"doi-asserted-by":"crossref","unstructured":"Dong X, Wu X, Sun G, Xie Y, Li H, Chen Y (2008) Circuit and microarchitecture evaluation of 3D stacking magnetic RAM (MRAM) as a universal memory replacement. In: Proceedings of the 45th Design Automation Conference, pp 554\u2013559","key":"2389_CR8","DOI":"10.1145\/1391469.1391610"},{"unstructured":"Nvidia (2017) NVIDIA CUDA programming guide. \n                    http:\/\/docs.nvidia.com\/cuda\/pdf\/CUDA_C_Programming_Guide.eps\n                    \n                  . Accessed 25 Apr 2018","key":"2389_CR9"},{"unstructured":"Khronos OpenCL Working Group (2017) Khronos OpenCL. \n                    http:\/\/www.khronos.org\/opencl\/\n                    \n                  . Accessed 25 Apr 2018","key":"2389_CR10"},{"doi-asserted-by":"crossref","unstructured":"Che S, Boyer M, Meng J, Tarjan D, Sheaffer JW, Lee SH, Skadron K (2009) Rodinia: a benchmark suite for heterogeneous computing. In: IEEE Proceedings of the International Symposium on Workload Characterization, pp 44\u201354","key":"2389_CR11","DOI":"10.1109\/IISWC.2009.5306797"},{"doi-asserted-by":"crossref","unstructured":"Wang Z, Jimenez DA, Xu C, Sun G, Xie Y (2014) Adaptive placement and migration policy for an STT-RAM-based hybrid cache. In: Proceedings of the 20th International High Performance Computer Architecture Symposium, pp 13\u201324","key":"2389_CR12","DOI":"10.1109\/HPCA.2014.6835933"},{"unstructured":"Chen Y, Cong J, Huang H, Liu B, Liu C, Potkonjak M, Reinman G (2012) Dynamically reconfigurable hybrid cache: an energy-efficient last-level cache design. In: Proceedings of the Design, Automation, and Test in Europe Conference and Exhibition, pp 45\u201350","key":"2389_CR13"},{"doi-asserted-by":"crossref","unstructured":"Chen Y, Cong J, Huang H, Liu C, Prabhakar R, Reinman G (2012) Static and dynamic co-optimizations for blocks mapping in hybrid caches. In: Proceedings of the International Low Power Electronics and Design Design Symposium, pp 237\u2013242","key":"2389_CR14","DOI":"10.1145\/2333660.2333717"},{"doi-asserted-by":"crossref","unstructured":"Lee J, Kim H (2012) TAP: a TLP-aware cache management policy for a CPU\u2013GPU heterogeneous architecture. In: Proceedings of the 18th International High Performance Computer Architecture Symposium, pp 1\u201312","key":"2389_CR15","DOI":"10.1109\/HPCA.2012.6168947"},{"issue":"1","key":"2389_CR16","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/LCA.2014.2299539","volume":"14","author":"J Power","year":"2015","unstructured":"Power J, Hestness J, Orr MS, Hill MD, Wood DA (2015) gem5-gpu: a heterogeneous CPU-GPU simulator. IEEE Comput Archit Lett 14(1):34\u201336","journal-title":"IEEE Comput Archit Lett"},{"issue":"1","key":"2389_CR17","doi-asserted-by":"publisher","first-page":"130","DOI":"10.1145\/1241601.1241625","volume":"35","author":"CD Spradling","year":"2007","unstructured":"Spradling CD (2007) SPEC CPU2006 benchmark tools. ACM SIGARCH Comput Archit News 35(1):130\u2013134","journal-title":"ACM SIGARCH Comput Archit News"},{"unstructured":"Thapliyal H, Arabnia HR, Bajpai R, Sharma KK (2007) Combined integer and variable precision (CIVP) floating point multiplication architecture for FPGAs. In: Proceedings of the 13th International Conference on Parallel and Distributed Processing Techniques and Applications, pp 449\u2013450","key":"2389_CR18"},{"doi-asserted-by":"crossref","unstructured":"Thapliyal H, Arabnia HR, Vinod AP (2006) Combined integer and floating point multiplication architecture (CIFM) for FPGAs and its reversible logic implementation. In: Proceedings of the 49th IEEE International Midwest Symposium on Circuits and Systems, pp 148\u2013154","key":"2389_CR19","DOI":"10.1109\/MWSCAS.2006.382306"},{"issue":"7420","key":"2389_CR20","doi-asserted-by":"crossref","first-page":"73","DOI":"10.1007\/978-3-642-35840-1_4","volume":"17","author":"H Thapliyal","year":"2013","unstructured":"Thapliyal H, Jayashree HV, Nagamani AN, Arabnia HR (2013) Progress in reversible processor design: a novel methodology for reversible carry look-ahead adder. Trans Comput Sci 17(7420):73\u201397","journal-title":"Trans Comput Sci"},{"issue":"5300","key":"2389_CR21","first-page":"99","volume":"3","author":"H Thapliyal","year":"2009","unstructured":"Thapliyal H, Arabnia HR, Srinivas MB (2009) Efficient reversible logic design of BCD subtractors. Trans Comput Sci 3(5300):99\u2013121","journal-title":"Trans Comput Sci"},{"issue":"3","key":"2389_CR22","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1111\/j.1467-8659.1986.tb00296.x","volume":"5","author":"HR Arabnia","year":"1986","unstructured":"Arabnia HR, Oliver MA (1986) Fast operations on raster images with SIMD machine architectures. Comput Graph Forum 5(3):179\u2013188","journal-title":"Comput Graph Forum"},{"issue":"4","key":"2389_CR23","doi-asserted-by":"publisher","first-page":"1477","DOI":"10.1007\/s11227-016-1676-0","volume":"72","author":"HV Jayashree","year":"2016","unstructured":"Jayashree HV, Thapliyal H, Arabnia HR, Agrawal VK (2016) Ancilla-input and garbage-output optimized design of a reversible quantum integer multiplier. J Supercomput 72(4):1477\u20131493","journal-title":"J Supercomput"},{"issue":"1\u20132","key":"2389_CR24","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1023\/A:1019119117297","volume":"10","author":"HR Arabnia","year":"1998","unstructured":"Arabnia HR, Taha TR (1998) A parallel numerical algorithm on a reconfigurable multi-ring network. Telecommun Syst 10(1\u20132):185\u2013203","journal-title":"Telecommun Syst"},{"unstructured":"Mekkat V, Holey A, Yew P, Zhai A (2013) Managing shared last-level cache in a heterogeneous multicore processor. In: Proceedings of the 22nd International Parallel Architectures and Compilation Techniques Symposium, pp 225\u2013234","key":"2389_CR25"},{"doi-asserted-by":"crossref","unstructured":"Rai S, Chaudhuri M (2016) Exploiting dynamic reuse probability to manage shared last-level caches in CPU\u2013GPU heterogeneous processors. In: Proceedings of the 30th International Supercomputing Symposium, pp 3\u201314","key":"2389_CR26","DOI":"10.1145\/2925426.2926266"},{"doi-asserted-by":"crossref","unstructured":"Zhan J, Kayiran O, Loh GH, Das CR, Xie Y (2016) OSCAR: orchestrating STT-RAM cache traffic in heterogeneous architectures. In: Proceedings of the 49th International Microarchitecture Symposium, pp 1\u201313","key":"2389_CR27","DOI":"10.1109\/MICRO.2016.7783731"},{"unstructured":"Garca V, GomezLuna J, Grass T, Rico A, Ayguade E, Pena AJ (2016) Evaluating the effect of last-level cache sharing on integrated GPU-CPU systems with heterogeneous applications. In: IEEE Proceedings of the International Symposium on Workload Characterization, pp 1\u201310","key":"2389_CR28"},{"doi-asserted-by":"crossref","unstructured":"Jadidi A, Arjomand M, Sarbazi-Azad H (2011) High-endurance and performance-efficient design of hybrid cache architectures through adaptive line replacement. In: Proceedings of the 17th International Low Power Electronics and Design Symposium, pp 79\u201384","key":"2389_CR29","DOI":"10.1109\/ISLPED.2011.5993611"},{"doi-asserted-by":"crossref","unstructured":"Sun G, Dong X, Xie Y, Li J, Chen Y (2009) A novel architecture of the 3D stacked MRAM L2 cache for CMPs. In: Proceedings of the 15th International High Performance Computer Architecture Symposium, pp 239\u2013249","key":"2389_CR30","DOI":"10.1109\/HPCA.2009.4798259"},{"unstructured":"Lin I, Chiou JS (2013) High-endurance hybrid cache design in cmp architecture with cache partitioning and access-aware policy. In: Proceedings of the 23rd International Great Lakes Symposium on VLSI, pp 19\u201324","key":"2389_CR31"},{"doi-asserted-by":"crossref","unstructured":"Wang J, Tim Y, Wong WF, Ong ZL, Sun Z, Li H (2014) A coherent hybrid SRAM and STT-RAM L1 cache architecture for shared memory multicores. In: Proceedings of the 19th Asia and South Pacific Design Automation Conference, pp 610\u2013615","key":"2389_CR32","DOI":"10.1109\/ASPDAC.2014.6742958"},{"doi-asserted-by":"crossref","unstructured":"Wu X, Li J, Zhang L, Speight E, Rajamony R, Xie Y (2009) Hybrid cache architecture with disparate memory technologies. In: Proceedings of the International the 36th Computer Architecture Symposium, pp 34\u201345","key":"2389_CR33","DOI":"10.1145\/1555815.1555761"},{"issue":"3","key":"2389_CR34","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1145\/1880037.1880040","volume":"7","author":"X Wu","year":"2010","unstructured":"Wu X, Li J, Zhang L, Speight E, Rajamony R, Xie Y (2010) Design exploration of hybrid caches with disparate memory technologies. ACM Trans Archit Code Optim 7(3):15","journal-title":"ACM Trans Archit Code Optim"},{"unstructured":"Wu X, Li J, Zhang L, Speight E, Xie Y (2009) Power and performance of read-write aware hybrid caches with non-volatile memories. In: Proceedings of the Design, Automation and Test in Europe Conference and Exhibition, pp 737\u2013742","key":"2389_CR35"},{"doi-asserted-by":"crossref","unstructured":"Li Y, Chen Y, Jones AK (2012) A software approach for combating asymmetries of non-volatile memories. In: Proceedings of the International Low Power Electronics and Design Symposium, pp 191\u2013196","key":"2389_CR36","DOI":"10.1145\/2333660.2333708"},{"doi-asserted-by":"crossref","unstructured":"Li Q, Li J, Shi L, Xue CJ, He Y (2012) MAC: migration-aware compilation for STT-RAM based hybrid cache in embedded systems. In: Proceedings of the International Low Power Electronics and Design Symposium, pp 351\u2013356","key":"2389_CR37","DOI":"10.1145\/2333660.2333738"},{"doi-asserted-by":"crossref","unstructured":"Li J, Shi L, Xue CJ, Yang C, Xu Y (2011) Exploiting set-level write non-uniformity for energy-efficient NVM-based hybrid cache. In: Proceedings of the 9th Embedded Systems for Real-Time Multimedia Symposium, pp 19\u201328","key":"2389_CR38","DOI":"10.1109\/ESTIMedia.2011.6088521"},{"issue":"2","key":"2389_CR39","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1109\/TCAD.2016.2582872","volume":"36","author":"R Wang","year":"2017","unstructured":"Wang R, Jia D, Li T, Qian DP (2017) Achieving versatile and simultaneous cache optimizations with nonvolatile SRAM. IEEE Trans Comput Aided Des Integr Circuits Syst 36(2):241\u2013254","journal-title":"IEEE Trans Comput Aided Des Integr Circuits Syst"},{"doi-asserted-by":"crossref","unstructured":"Smullen CW, Mohan V, Nigam A, Gurumurthi S, Stan MR (2011) Relaxing non-volatility for fast and energy-efficient STT-RAM caches. In: Proceedings of the 17th International High Performance Computer Architecture Symposium, pp 50\u201361","key":"2389_CR40","DOI":"10.1109\/HPCA.2011.5749716"},{"unstructured":"Samavatian MH, Abbasitabar H, Arjomand M, Sarbazi-Azad H (2014) An efficient STT-RAM last-level cache architecture for GPUs. In: Proceedings of the 51st Design Automation Conference, pp 1\u20136","key":"2389_CR41"},{"doi-asserted-by":"crossref","unstructured":"Chen X, Chang LW, Rodrigues CI, Lv J, Wang Z, Hwu WM (2014) Adaptive cache management for energy-efficient GPU computing. In: Proceedings of the 47th International Microarchitecture Symposium, pp 343\u2013355","key":"2389_CR42","DOI":"10.1109\/MICRO.2014.11"},{"doi-asserted-by":"crossref","unstructured":"Goswami N, Cao B, Li T (2013) Power-performance co-optimization of throughput core architecture using resistive memory. In: Proceedings of the 19th International High Performance Computer Architecture Symposium, pp 342\u2013353","key":"2389_CR43","DOI":"10.1109\/HPCA.2013.6522331"},{"doi-asserted-by":"crossref","unstructured":"Li G, Chen X, Sun G, Hoffmann H, Liu Y, Wang Y, Yang H (2015) A STT-RAM-based low-power hybrid register file for GPGPUs. In: Proceedings of the 52nd Design Automation Conference, pp 1\u20136","key":"2389_CR44","DOI":"10.1145\/2744769.2744785"},{"doi-asserted-by":"crossref","unstructured":"Liu X, Mao M, Bi X, Li H, Chen Y (2015) An efficient STT-RAM-based register file in GPU architectures. In: Proceedings of the 20th Asia and South Pacific Design Automation Conference, pp 490\u2013495","key":"2389_CR45","DOI":"10.1109\/ASPDAC.2015.7059054"},{"doi-asserted-by":"crossref","unstructured":"Deng Q, Zhang Y, Zhang M, Yang J (2017) Towards warp-scheduler friendly STT-RAM\/SRAM hybrid GPGPU register file design. In: Proceedings of the 36th IEEE\/ACM International Conference on Computer-Aided Design, pp 736\u2013742","key":"2389_CR46","DOI":"10.1109\/ICCAD.2017.8203850"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-018-2389-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-018-2389-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-018-2389-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,19]],"date-time":"2019-05-19T12:43:51Z","timestamp":1558269831000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-018-2389-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,5,19]]},"references-count":46,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2018,7]]}},"alternative-id":["2389"],"URL":"https:\/\/doi.org\/10.1007\/s11227-018-2389-3","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2018,5,19]]},"assertion":[{"value":"19 May 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}