{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T13:19:55Z","timestamp":1758892795707},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2016,2,1]],"date-time":"2016-02-01T00:00:00Z","timestamp":1454284800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2016,2]]},"DOI":"10.1007\/s11227-015-1608-4","type":"journal-article","created":{"date-parts":[[2016,2,4]],"date-time":"2016-02-04T09:54:59Z","timestamp":1454579699000},"page":"718-752","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Locality-aware data replication in the last-level cache for large scale multicores"],"prefix":"10.1007","volume":"72","author":[{"given":"Farrukh","family":"Hijaz","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingchuan","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"George","family":"Kurian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Srinivas","family":"Devadas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Omer","family":"Khan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,2,4]]},"reference":[{"issue":"2","key":"1608_CR1","doi-asserted-by":"crossref","first-page":"8","DOI":"10.1109\/MM.2013.4","volume":"33","author":"RG Dreslinski","year":"2013","unstructured":"Dreslinski RG, Fick D, Giridhar B, Kim G, Seo S, Fojtik M, Satpathy S, Lee Y, Kim D, Liu N, Wieckowski M, Chen G, Sylvester D, Blaauw D, Mudge T (2013) Centip3de: a 64-core, 3d stacked near-threshold system. IEEE Micro 33(2):8\u201316. doi: 10.1109\/MM.2013.4","journal-title":"IEEE Micro"},{"key":"1608_CR2","unstructured":"Kaul H, Anders M, Hsu S, Agarwal A, Krishnamurthy R, Borkar S (2012) Nearthreshold voltage (ntv) design: opportunities and challenges. In: Design Automation Conference. ACM, pp 1149\u20131154"},{"key":"1608_CR3","doi-asserted-by":"crossref","unstructured":"Borkar S (2007) Thousand core chips: a technology perspective. In: Proceedings of the 44th annual design automation conference. ACM, New York, NY, USA, DAC\u201907, pp 746\u2013749. doi: 10.1145\/1278480.1278667","DOI":"10.1145\/1278480.1278667"},{"key":"1608_CR4","doi-asserted-by":"crossref","unstructured":"Bell S, Edwards B, Amann J, Conlin R, Joyce K, Leung V, MacKay J, Reif M, Bao L, Brown J, Mattina M, Miao CC, Ramey C, Wentzlaff D, Anderson W, Berger E, Fairbanks N, Khan D, Montenegro F, Stickney J, Zook J (2008) Tile64-processor: a 64-core soc with mesh interconnect. In: IEEE international solid-state circuits conference, 2008. ISSCC 2008. Digest of Technical Papers, pp 88\u2013598. doi: 10.1109\/ISSCC.2008.4523070","DOI":"10.1109\/ISSCC.2008.4523070"},{"key":"1608_CR5","doi-asserted-by":"crossref","unstructured":"Agarwal A, Simoni R, Hennessy JL, Horowitz M (1988) An Evaluation of Directory Schemes for Cache Coherence. In: International symposium on computer architecture","DOI":"10.1109\/ISCA.1988.5238"},{"issue":"7","key":"1608_CR6","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1145\/2209249.2209269","volume":"55","author":"MMK Martin","year":"2012","unstructured":"Martin MMK, Hill MD, Sorin DJ (2012) Why on-chip cache coherence is here to stay. Commun ACM 55(7):78\u201389","journal-title":"Commun ACM"},{"key":"1608_CR7","doi-asserted-by":"crossref","unstructured":"Sanchez D, Kozyrakis C (2012) SCD: a scalable coherence directory with flexible sharer set encoding. In: International symposium on high-performance computer architecture","DOI":"10.1109\/HPCA.2012.6168950"},{"key":"1608_CR8","doi-asserted-by":"crossref","unstructured":"Zhao H, Shriraman A, Dwarkadas S (2010) SPACE: sharing pattern-based directory coherence for multicore scalability. In: International conference on parallel architectures and compilation techniques, pp 135\u2013146","DOI":"10.1145\/1854273.1854294"},{"key":"1608_CR9","doi-asserted-by":"crossref","unstructured":"Zebchuk J, Srinivasan V, Qureshi MK, Moshovos A (2009) A tagless coherence directory. In: International symposium on microarchitecture","DOI":"10.1145\/1669112.1669166"},{"key":"1608_CR10","first-page":"321","volume":"39","author":"N Eisley","year":"2006","unstructured":"Eisley N, Peh LS, Shang L (2006) In-network cache coherence. In: IEEE\/ACM International symposium on microarchitecture, MICRO 39:321\u2013332. doi: 10.1109\/MICRO.2006.27","journal-title":"In: IEEE\/ACM International symposium on microarchitecture, MICRO"},{"key":"1608_CR11","doi-asserted-by":"crossref","unstructured":"Kurian G, Khan O, Devadas S (2013) The locality-aware adaptive cache coherence protocol. In: Proceedings of the 40th annual international symposium on computer architecture. ACM, New York, NY, USA, ISCA\u201913, pp 523\u2013534. doi: 10.1145\/2485922.2485967","DOI":"10.1145\/2485922.2485967"},{"issue":"2","key":"1608_CR12","doi-asserted-by":"crossref","first-page":"16","DOI":"10.1109\/MM.2010.31","volume":"30","author":"P Conway","year":"2010","unstructured":"Conway P, Kalyanasundharam N, Donley G, Lepak K, Hughes B (2010) Cache hierarchy and memory subsystem of the amd opteron processor. Micro IEEE 30(2):16\u201329. doi: 10.1109\/MM.2010.31","journal-title":"Micro IEEE"},{"key":"1608_CR13","unstructured":"First the tick, now the tock: next generation intel microarchitecture (Nehalem). White Paper (2008)"},{"key":"1608_CR14","doi-asserted-by":"crossref","unstructured":"Kim C, Burger D, Keckler SW (2002) An adaptive, non-uniform cache structure for wire-delay dominated on-chip caches. In: International conference on architectural support for programming languages and operating systems (ASPLOS), pp 211\u2013222","DOI":"10.1145\/605397.605420"},{"key":"1608_CR15","doi-asserted-by":"crossref","unstructured":"Chishti Z, Powell MD, Vijaykumar TN (2005) Optimizing replication, communication, and capacity allocation in cmps. In: Proceedings of the 32Nd Annual international symposium on computer architecture, IEEE computer society, Washington, DC, USA, ISCA\u201905, pp 357\u2013368. doi: 10.1109\/ISCA.2005.39","DOI":"10.1109\/ISCA.2005.39"},{"key":"1608_CR16","doi-asserted-by":"crossref","unstructured":"Zhang M, Asanovic K (2005) Victim replication: Maximizing capacity while hiding wire delay in tiled chip multiprocessors. In: international symposium on computer architecture. doi: 10.1109\/ISCA.2005.53","DOI":"10.1109\/ISCA.2005.53"},{"key":"1608_CR17","doi-asserted-by":"crossref","unstructured":"Beckmann BM, Marty MR, Wood DA (2006) Wood. Asr: adaptive selective replication for cmp caches. In: Proceedings of the 39th Annual IEEE\/ACM International Symposium on Microarchitecture, IEEE computer society, Washington, DC, USA, MICRO 39, pp 443\u2013454. doi: 10.1109\/MICRO.2006.10","DOI":"10.1109\/MICRO.2006.10"},{"key":"1608_CR18","doi-asserted-by":"crossref","unstructured":"Chaudhuri M (2009) PageNUCA: selected policies for page-grain locality management in large shared chip-multiprocessor caches. In: HPCA, pp 227\u2013238","DOI":"10.1109\/HPCA.2009.4798258"},{"key":"1608_CR19","first-page":"184","volume-title":"In: Proceedings of the 36th annual international symposium on computer architecture (ISCA\u201909)","author":"N Hardavellas","year":"2009","unstructured":"Hardavellas N, Ferdman M, Falsafi B, Ailamaki A (2009) Reactive NUCA: Near-Optimal Block Placement and Replication in Distributed Caches. In: Proceedings of the 36th annual international symposium on computer architecture (ISCA\u201909). ACM, New York, NY, USA, pp 184\u2013195"},{"key":"1608_CR20","doi-asserted-by":"crossref","unstructured":"Shi Q, Hijaz F, Khan O (2013) Towards efficient dynamic data placement in noc-based multicores. In: IEEE 31st International Conference on Computer Design (ICCD), 2013, pp 369\u2013376. doi: 10.1109\/ICCD.2013.6657067","DOI":"10.1109\/ICCD.2013.6657067"},{"key":"1608_CR21","doi-asserted-by":"crossref","unstructured":"Merino J, Puente V, Gregorio J (2010) Esp-nuca: a low-cost adaptive non-uniform cache architecture. In: IEEE 16th international symposium on high performance computer architecture (HPCA), 2010, pp 1\u201310. doi: 10.1109\/HPCA.2010.5416641","DOI":"10.1109\/HPCA.2010.5416641"},{"issue":"12","key":"1608_CR22","doi-asserted-by":"crossref","first-page":"1112","DOI":"10.1109\/TC.1978.1675013","volume":"27","author":"LM Censier","year":"1978","unstructured":"Censier LM, Feautrier P (1978) A new solution to coherence problems in multicache systems. IEEE Trans Comput 27(12):1112\u20131118. doi: 10.1109\/TC.1978.1675013","journal-title":"IEEE Trans Comput"},{"key":"1608_CR23","doi-asserted-by":"crossref","unstructured":"Bell S, Edwards B, Amann J, Conlin R, Joyce K, Leung V, MacKay J, Reif M, Bao L, Brown J, Mattina M, Miao C, Ramey C, Wentzlaff D, Anderson W, Berger E, Fairbanks N, Khan D, Montenegro F, Stickney J, Zook J (2008) TILE64-processor: a 64-Core SoC with mesh interconnect. In: International Solid-State Circuits Conference","DOI":"10.1109\/ISSCC.2008.4523070"},{"key":"1608_CR24","unstructured":"Kurian G, Miller J, Psota J, Eastep J, Liu J, Michel J, Kimerling L, Agarwal A (2010) ATAC: a 1000-core cache-coherent processor with on-chip optical network. In: International conference on parallel architectures and compilation techniques"},{"key":"1608_CR25","doi-asserted-by":"crossref","unstructured":"Cho S, Jin L (2006) Managing distributed, shared l2 caches through os-level page allocation. In: Proceedings of the 39th annual IEEE\/ACM international symposium on microarchitecture, IEEE computer society, Washington, DC, USA, MICRO 39, pp 455\u2013468. doi: 10.1109\/MICRO.2006.31 . http:\/\/dl.acm.org\/citation.cfm?id=1194858","DOI":"10.1109\/MICRO.2006.31"},{"key":"1608_CR26","doi-asserted-by":"crossref","unstructured":"Awasthi M, Sudan K, Balasubramonian R, Carter J (2009) Dynamic hardware-assisted software-controlled page placement to manage capacity allocation and sharing within large caches. In: IEEE 15th international symposium on high performance computer architecture, 2009. HPCA 2009, pp 250\u2013261. doi: 10.1109\/HPCA.2009.4798260","DOI":"10.1109\/HPCA.2009.4798260"},{"key":"1608_CR27","doi-asserted-by":"crossref","unstructured":"Kurian G, Devadas S, Khan O (2014) Locality-aware data replication in the last-level cache. In: IEEE 120th international symposium on high performance computer architecture (HPCA2014), 2014","DOI":"10.1109\/HPCA.2014.6835921"},{"key":"1608_CR28","doi-asserted-by":"crossref","unstructured":"Chang J, Sohi G (2006) Cooperative caching for chip multiprocessors. In: 33rd international symposium on computer architecture, 2006. ISCA\u201906, pp 264\u2013276. doi: 10.1109\/ISCA.2006.17","DOI":"10.1109\/ISCA.2006.17"},{"key":"1608_CR29","doi-asserted-by":"crossref","unstructured":"Herrero E, Gonz\u00e1lez J, Canal R (2010) Elastic cooperative caching: an autonomous dynamically adaptive memory hierarchy for chip multiprocessors. In: Proceedings of the 37th Annual international symposium on computer architecture. ACM, New York, NY, USA, ISCA\u201910, pp 419\u2013428. doi: 10.1145\/1815961.1816018","DOI":"10.1145\/1815961.1816018"},{"key":"1608_CR30","doi-asserted-by":"crossref","unstructured":"Qureshi MK (2009) Adaptive spill-receive for robust high-performance caching in cmps. In: IEEE 15th international symposium on high performance computer architecture, 2009. HPCA 2009, pp 45\u201354. doi: 10.1109\/HPCA.2009.4798236","DOI":"10.1109\/HPCA.2009.4798236"},{"key":"1608_CR31","doi-asserted-by":"crossref","unstructured":"Srikantaiah S, Kultursay E, Zhang T, Kandemir M, Irwin MJ, Xie Y (2011) Morphcache: a reconfigurable adaptive multi-level cache hierarchy. In: IEEE 17th international symposium on high performance computer architecture (HPCA), 2011 pp 231\u2013242. doi: 10.1109\/HPCA.2011.5749732","DOI":"10.1109\/HPCA.2011.5749732"},{"key":"1608_CR32","doi-asserted-by":"crossref","unstructured":"Lee H, Cho S, Childers B (2011) Cloudcache: Expanding and shrinking private caches. In: IEEE 17th international symposium on high performance computer architecture (HPCA), 2011 pp 219\u2013230. doi: 10.1109\/HPCA.2011.5749731","DOI":"10.1109\/HPCA.2011.5749731"},{"key":"1608_CR33","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-031-01733-9","volume-title":"A primer on memory consistency and cache coherence","author":"DJ Sorin","year":"2011","unstructured":"Sorin DJ, Hill MD, Wood DA (2011) A primer on memory consistency and cache coherence. Synthesis lectures in computer architecture. Morgan Claypool Publishers, San Rafael"},{"key":"1608_CR34","doi-asserted-by":"crossref","unstructured":"Jaleel A, Borch E, Bhandaru M, Steely Jr SC, Emer J (2010) Achieving non-inclusive cache performance with inclusive caches: Temporal locality aware (tla) cache management policies. In: Proceedings of the 2010 43rd annual IEEE\/ACM international symposium on microarchitecture, IEEE computer society, Washington, DC, USA, MICRO\u201943, pp 151\u2013162. doi: 10.1109\/MICRO.2010.52","DOI":"10.1109\/MICRO.2010.52"},{"key":"1608_CR35","unstructured":"Miller JE, Kasture H, Kurian G, Gruenwald C, Beckmann N, Celio C, Eastep J, Agarwal A (2010) A distributed parallel simulator for multicores. In: 16th international symposium on high performance computer architecture (HPCA), pp 1\u201312"},{"key":"1608_CR36","unstructured":"Dally WJ, Towles B (2004) Principles and practices of interconnection networks. Morgan Kaufmann"},{"key":"1608_CR37","doi-asserted-by":"crossref","first-page":"398","DOI":"10.1145\/2228360.2228431","volume-title":"In: Proceedings of the 49th annual design automation conference (DAC\u201912)","author":"S Park","year":"2012","unstructured":"Park S, Krishna T, Chen CH, Daya B, Chandrakasan A, Peh LS (2012) Approaching the theoretical limits of a mesh noc with a 16-node chip prototype in 45nm soi. In: Proceedings of the 49th annual design automation conference (DAC\u201912). ACM, New York, NY, USA, pp 398\u2013405"},{"key":"1608_CR38","doi-asserted-by":"crossref","unstructured":"Sun C, Chen CHO, Kurian G, Wei L, Miller J, Agarwal A, Peh LS, Stojanovic V (2012) DSENT-a tool connecting emerging photonics with electronics for opto-electronic networks-on-chip modeling. In: 6th IEEE\/ACM international symposium on symposium on networks-on-chip (NoCS), pp 201\u2013210, 9\u201311 May 2012","DOI":"10.1109\/NOCS.2012.31"},{"key":"1608_CR39","doi-asserted-by":"crossref","unstructured":"Li S, Ahn JH, Strong RD, Brockman JB, Tullsen DM, Jouppi NP (2009) Mcpat: an integrated power, area, and timing modeling framework for multicore and manycore architectures. In: 42nd annual IEEE\/ACM international symposium on microarchitecture, MICRO-42, pp 469\u2013480, 12\u201316 Dec 2009","DOI":"10.1145\/1669112.1669172"},{"key":"1608_CR40","doi-asserted-by":"crossref","unstructured":"Thoziyoor S, Ahn JH, Monchiero M, Brockman JB, Jouppi NP (2008) A comprehensive memory modeling tool and its application to the design and analysis of future memory hierarchies. In: 35th international symposium on computer architecture, ISCA\u201908, pp 51\u201362, 21\u201325 June 2008","DOI":"10.1109\/ISCA.2008.16"},{"issue":"8","key":"1608_CR41","doi-asserted-by":"crossref","first-page":"1674","DOI":"10.1109\/TED.2009.2024022","volume":"56","author":"A Khakifirooz","year":"2009","unstructured":"Khakifirooz A, Nayfeh OM, Antoniadis D (2009) A simple semiempirical short-channel MOSFET current-voltage model continuous across all regions of operation and employing only physical parameters. IEEE Transactions Electron Devices 56(8):1674\u20131680","journal-title":"IEEE Transactions Electron Devices"},{"issue":"5","key":"1608_CR42","doi-asserted-by":"crossref","first-page":"1361","DOI":"10.1109\/TED.2011.2121912","volume":"58","author":"L Wei","year":"2011","unstructured":"Wei L, Boeuf F, Skotnicki T, Wong HS (2011) Parasitic capacitances: analytical models and impact on circuit-Level performance. IEEE Transactions on Electron Devices 58(5):1361\u20131370","journal-title":"IEEE Transactions on Electron Devices"},{"key":"1608_CR43","doi-asserted-by":"crossref","unstructured":"Woo SC, Ohara M, Torrie E, Singh JP, Gupta A (1995) The SPLASH-2 Programs: characterization and methodological considerations. In: Proceedings of 22nd annual international symposium on computer architecture, pp 24\u201336, 22\u201324 June 1995","DOI":"10.1145\/223982.223990"},{"key":"1608_CR44","doi-asserted-by":"crossref","first-page":"72","DOI":"10.1145\/1454115.1454128","volume-title":"In: Proceedings of the 17th international conference on parallel architectures and compilation techniques (PACT\u201908)","author":"C Bienia","year":"2008","unstructured":"Bienia C, Kumar S, Singh JP, Li K (2008) The PARSEC Benchmark Suite: characterization and architectural implications. In: Proceedings of the 17th international conference on parallel architectures and compilation techniques (PACT\u201908). ACM, New York, NY, USA, pp 72\u201381"},{"issue":"3","key":"1608_CR45","doi-asserted-by":"crossref","first-page":"209","DOI":"10.14778\/2735508.2735511","volume":"8","author":"X Yu","year":"2014","unstructured":"Yu X, Bezerra G, Pavlo A, Devadas S, Stonebraker M (2014) Staring into the abyss: an evaluation of concurrency control with one thousand cores. Proc VLDB Endow 8(3):209\u2013220. doi: 10.14778\/2735508.2735511","journal-title":"Proc VLDB Endow"},{"key":"1608_CR46","doi-asserted-by":"crossref","unstructured":"Iqbal S, Liang Y, Grahn H (2010) ParMiBench - an open-source benchmark for embedded multiprocessor systems. Comput Archit Lett","DOI":"10.1109\/L-CA.2010.14"},{"key":"1608_CR47","unstructured":"DARPA UHPC Program BAA. https:\/\/www.fbo.gov\/spg\/ODA\/DARPA\/CMO\/DARPA-BAA-10-37\/listing.html (2010)"},{"key":"1608_CR48","doi-asserted-by":"crossref","unstructured":"Ahmad M, Hijaz F, Shi Q, Khan O (2015) A benchmark suite for multithreaded graph algorithms executing on futuristic multicores. In: IEEE international symposium on workload characterization (IISWC), 2015 pp 44\u201355. doi: 10.1109\/IISWC.2015.11","DOI":"10.1109\/IISWC.2015.11"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-015-1608-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-015-1608-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-015-1608-4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,16]],"date-time":"2023-08-16T19:04:25Z","timestamp":1692212665000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-015-1608-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,2]]},"references-count":48,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2016,2]]}},"alternative-id":["1608"],"URL":"https:\/\/doi.org\/10.1007\/s11227-015-1608-4","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016,2]]}}}