{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,1,7]],"date-time":"2023-01-07T21:07:27Z","timestamp":1673125647430},"reference-count":67,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2011,8,7]],"date-time":"2011-08-07T00:00:00Z","timestamp":1312675200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2012,2]]},"DOI":"10.1007\/s10766-011-0178-1","type":"journal-article","created":{"date-parts":[[2011,8,6]],"date-time":"2011-08-06T07:24:06Z","timestamp":1312615446000},"page":"57-83","source":"Crossref","is-referenced-by-count":3,"title":["Managing Data Placement in Memory Systems with Multiple Memory Controllers"],"prefix":"10.1007","volume":"40","author":[{"given":"M.","family":"Awasthi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"D.","family":"Nellans","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"K.","family":"Sudan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"R.","family":"Balasubramonian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"A.","family":"Davis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2011,8,7]]},"reference":[{"key":"178_CR1","doi-asserted-by":"crossref","unstructured":"Abts, D., Jerger, N., Kim, J., Gibson, D., Lipasti, M.: Achieving predictable performance through better memory controller in many-core CMPs. In: Proceedings of ISCA (2009)","DOI":"10.1145\/1555754.1555810"},{"key":"178_CR2","doi-asserted-by":"crossref","unstructured":"Awasthi, M., Sudan, K., Balasubramonian, R., Carter, J.: Dynamic hardware-assisted software-controlled page placement to manage capacity allocation and sharing within large caches. In: Proceedings of HPCA (2009)","DOI":"10.1109\/HPCA.2009.4798260"},{"key":"178_CR3","doi-asserted-by":"crossref","unstructured":"Benia, C., et\u00a0al.: The PARSEC benchmark suite: characterization and architectural implications. Technical report, Department of Computer Science, Princeton University (2008)","DOI":"10.1145\/1454115.1454128"},{"key":"178_CR4","doi-asserted-by":"crossref","unstructured":"Bershad, B., Chen, B., Lee, D., Romer, T.: Avoiding conflict misses dynamically in large direct-mapped caches. In: Proceedings of ASPLOS (1994)","DOI":"10.1145\/195473.195527"},{"key":"178_CR5","unstructured":"Burr, G.W., Breitwisch, M.J., Franceschini, M., Garetto, D., Gopalakrishnan, K., Jackson, B., Kurdi, B., Lam, C., Lastras, L.A., Padilla, A., Rajendran, B., Raoux, S., Shenoy, R.S.: Phase Change Memory Technology. (2010). http:\/\/arxiv.org\/abs\/1001.1164v1"},{"key":"178_CR6","doi-asserted-by":"crossref","unstructured":"Chandra, R., Devine, S., Verghese, B., Gupta, A., Rosenblum, M.: Scheduling and page migration for multiprocessor compute servers. In: Proceedings of ASPLOS (1994)","DOI":"10.1145\/195473.195485"},{"key":"178_CR7","doi-asserted-by":"crossref","unstructured":"Chang, J., Sohi, G.: Co-operative caching for chip multiprocessors. In: Proceedings of ISCA (2006)","DOI":"10.1145\/1150019.1136509"},{"key":"178_CR8","doi-asserted-by":"crossref","unstructured":"Chaudhuri, M.: PageNUCA: selected policies for page-grain locality management in large shared chip-multiprocessor caches. In: Proceedings of HPCA (2009)","DOI":"10.1109\/HPCA.2009.4798258"},{"key":"178_CR9","doi-asserted-by":"crossref","unstructured":"Chishti, Z., Powell, M., Vijaykumar, T.: Optimizing replication, communication, and capacity allocation in CMPs. In: Proceedings of ISCA-32 (June 2005)","DOI":"10.1145\/1080695.1070001"},{"key":"178_CR10","doi-asserted-by":"crossref","unstructured":"Cho, S., Jin, L.: Managing distributed, shared L2 caches through OS-level page allocation. In: Proceedings of MICRO (2006)","DOI":"10.1109\/MICRO.2006.31"},{"key":"178_CR11","doi-asserted-by":"crossref","unstructured":"Corbalan, J., Martorell X., Labarta J.: Page Migration with dynamic space-sharing scheduling policies: the case of SGI 02000. Int. J. Parallel Prog. 32(4) (2004)","DOI":"10.1023\/B:IJPP.0000035815.13969.ec"},{"key":"178_CR12","doi-asserted-by":"crossref","unstructured":"Cuppu, V., Jacob, B.: Concurrency, latency, or system overhead: which has the largest impact on uniprocessor DRAM-System performance. In: Proceedings of ISCA (2001)","DOI":"10.1145\/379240.379252"},{"key":"178_CR13","doi-asserted-by":"crossref","unstructured":"Cuppu, V., Jacob, B., Davis, B., Mudge, T.: A performance comparison of contemporary DRAM architectures. In: Proceedings of ISCA (1999)","DOI":"10.1145\/307338.300998"},{"key":"178_CR14","unstructured":"Dally, W.: Report from Workshop on On- and Off-Chip Interconnection Networks for Multicore Systems (OCIN). (2006). http:\/\/www.ece.ucdavis.edu\/~ocin06\/"},{"key":"178_CR15","doi-asserted-by":"crossref","unstructured":"Deng, Q., Meisner, D., Ramos, L., Wenisch, T., Bianchini, R.: MemScale: active low-power modes for main memory. In: Proceedings of ASPLOS (2011)","DOI":"10.1145\/1950365.1950392"},{"key":"178_CR16","unstructured":"Ding, X., Nikopoulosi, D.S., Jiang, S., Zhang, X.: MESA: Reducing cache conflicts by integrating static and run-time methods. In: Proceedings of ISPASS (2006)"},{"key":"178_CR17","doi-asserted-by":"crossref","unstructured":"Dybdahl, H., Stenstrom, P.: An adaptive shared\/private NUCA cache partitioning scheme for chip multiprocessors. In: Proceedings of HPCA (2007)","DOI":"10.1109\/HPCA.2007.346180"},{"key":"178_CR18","doi-asserted-by":"crossref","unstructured":"Fan, X., Zeng, H., Ellis, C.: Memory controller policies for DRAM power management. In: Proceedings of ISLPED (2001)","DOI":"10.1145\/383082.383118"},{"key":"178_CR19","doi-asserted-by":"crossref","unstructured":"Gara, A., Blumrich, M.A., Chen, D., Chiu, G.L.-T., Coteus, P., Giampapa, M.E., Haring, R.A., Heidelberger, P., Hoenicke, D., Kopcsay, G.V., Liebsch, T.A., Ohmacht, M., Steinmacher-Burow, B.D., Takken, T., Vranas, P.: Overview of the blue gene\/l system architecture. IBM J. Res. Dev. 49 (2005)","DOI":"10.1147\/rd.492.0195"},{"key":"178_CR20","doi-asserted-by":"crossref","unstructured":"Hardavellas, N., Ferdman, M., Falsafi, B., Ailamaki, A.: Reactive NUCA: near-optimal block placement and replication in distributed caches. In: Proceedings of ISCA (2009)","DOI":"10.1145\/1555754.1555779"},{"key":"178_CR21","unstructured":"Intel 845G\/845GL\/845GV Chipset Datasheet: Intel 82845G\/82845GL\/82845GV Graphics and Memory Controller Hub (GMCH) (2002)"},{"key":"178_CR22","doi-asserted-by":"crossref","unstructured":"Ipek, E., Mutlu, O., Martinez, J., Caruana, R.: Self optimizing memory controllers: a reinforcement learning approach. In: Proceedings of ISCA (2008)","DOI":"10.1109\/ISCA.2008.21"},{"key":"178_CR23","unstructured":"ITRS. International Technology Roadmap for Semiconductors, 2007 Edition"},{"key":"178_CR24","volume-title":"Memory systems\u2014cache, DRAM disk","author":"B. Jacob","year":"2008","unstructured":"Jacob B., Ng S.W., Wang D.T.: Memory systems\u2014cache, DRAM disk. Elsevier, New York (2008)"},{"key":"178_CR25","doi-asserted-by":"crossref","unstructured":"Kessler, R.E., Hill, M.D.: Page placement algorithms for large real-indexed caches. ACM Trans. Comput. Syst. 10(4) (1992)","DOI":"10.1145\/138873.138876"},{"key":"178_CR26","doi-asserted-by":"crossref","unstructured":"Kim, C., Burger, D., Keckler, S.: An Adaptive, non-uniform cache structure for wire-dominated on-chip caches. In: Proceedings of ASPLOS (2002)","DOI":"10.1145\/605397.605420"},{"key":"178_CR27","unstructured":"Kim, Y., Han, D., Mutlu, O., Harchol-Balter, M.: ATLAS: a scalable and high-performance scheduling algorithm for multiple memory controllers. In: Proceedings of HPCA (2010)"},{"key":"178_CR28","unstructured":"LaRowe, R., Ellis, C.: Experimental comparison of memory management policies for NUMA multiprocessors. Technical report (1990)"},{"key":"178_CR29","doi-asserted-by":"crossref","unstructured":"LaRowe, R., Ellis, C.: Page placement policies for NUMA multiprocessors. J. Parallel Distrib. Comput. 11(2) (1991)","DOI":"10.1016\/0743-7315(91)90117-R"},{"key":"178_CR30","doi-asserted-by":"crossref","unstructured":"LaRowe, R., Wilkes, J., Ellis, C.: Exploiting operating system support for dynamic page placement on a NUMA shared memory multiprocessor. In: Proceedings of PPOPP (1991)","DOI":"10.1145\/109625.109639"},{"key":"178_CR31","doi-asserted-by":"crossref","unstructured":"Lebeck, A., Fan, X., Zeng, H., Ellis, C.: Power aware page allocation. In: Proceedings of ASPLOS (2000)","DOI":"10.1145\/378993.379007"},{"key":"178_CR32","doi-asserted-by":"crossref","unstructured":"Lee, B., Ipek, E., Mutlu, O., Burger, D.: Architecting phase change memory as a scalable DRAM alternative. In: Proceedings of ISCA (2009)","DOI":"10.1145\/1555754.1555758"},{"key":"178_CR33","doi-asserted-by":"crossref","unstructured":"Lee, C., Mutlu, O., Narasiman, V., Patt, Y.: Prefetch-aware DRAM controllers. In: Proceedings of MICRO (2008)","DOI":"10.1109\/MICRO.2008.4771791"},{"key":"178_CR34","unstructured":"Lin, J., Lu, Q., Ding, X., Zhang, Z., Zhang, X., Sadayappan, P.: Gaining insights into multicore cache partitioning: bridging the gap between simulation and real systems. In: Proceedings of HPCA (2008)"},{"key":"178_CR35","unstructured":"Lin, W., Reinhardt, S., Burger, D.: Designing a Modern memory hierarchy with hardware prefetching. In: Proceedings of IEEE transactions on computers (2001)"},{"key":"178_CR36","doi-asserted-by":"crossref","unstructured":"Loh, G.: 3D-stacked memory architectures for multi-core processors. In: Proceedings of ISCA (2008)","DOI":"10.1109\/ISCA.2008.15"},{"issue":"2","key":"178_CR37","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1109\/2.982916","volume":"35","author":"P. Magnusson","year":"2002","unstructured":"Magnusson P., Christensson M., Eskilson J., Forsgren D., Hallberg G., Hogberg J., Larsson F., Moestedt A., Werner B.: Simics: a full system simulation platform. IEEE Comput. 35(2), 50\u201358 (2002)","journal-title":"IEEE Comput."},{"key":"178_CR38","doi-asserted-by":"crossref","unstructured":"McCurdy, C., Vetter, J.: Memphis: Finding and fixing numa-related performance problems on multi-core platforms. In: Proceedings of ISPASS (2010)","DOI":"10.1109\/ISPASS.2010.5452060"},{"key":"178_CR39","unstructured":"Micron DDR3 SDRAM Part MT41J512M4.(2006) http:\/\/download.micron.com\/pdf\/datasheets\/dram\/ddr3\/2Gb_DDR3_SDRAM.pdf ,"},{"key":"178_CR40","unstructured":"Micron Technology Inc. Micron DDR2 SDRAM Part MT47H64M8. (2004)"},{"key":"178_CR41","unstructured":"Micron Technology Inc. Micron DDR2 SDRAM Part MT47H128M8HQ-25. (2007)"},{"key":"178_CR42","doi-asserted-by":"crossref","unstructured":"Min, R., Hu, Y.: Improving performance of large physically indexed caches by decoupling memory addresses from cache addresses. IEEE Trans. Comput. 50(11) (2001)","DOI":"10.1109\/12.966494"},{"key":"178_CR43","doi-asserted-by":"crossref","unstructured":"Muralimanohar, N., Balasubramonian, R., Jouppi, N.: Optimizing NUCA organizations and wiring alternatives for large caches with CACTI 6.0. In: Proceedings of MICRO (2007)","DOI":"10.1109\/MICRO.2007.4408241"},{"key":"178_CR44","doi-asserted-by":"crossref","unstructured":"Mutlu, O., Moscibroda, T.: Stall-time fair memory access scheduling for chip multiprocessors. In: Proceedings of MICRO (2007)","DOI":"10.1109\/MICRO.2007.21"},{"key":"178_CR45","doi-asserted-by":"crossref","unstructured":"Mutlu, O., Moscibroda, T.: Parallelism-aware batch scheduling: enhancing both performance and fairness of shared DRAM systems. In: Proceedings of ISCA (2008)","DOI":"10.1109\/ISCA.2008.7"},{"key":"178_CR46","unstructured":"Perfmon2 Project Homepage. http:\/\/perfmon2.sourceforge.net\/"},{"key":"178_CR47","unstructured":"Performance of the AMD Opteron LS21 for IBM BladeCenter. ftp:\/\/ftp.software.ibm.com\/eserver\/benchmarks\/wp_ls21_081506.pdf"},{"key":"178_CR48","doi-asserted-by":"crossref","unstructured":"Phadke, S., Narayanasamy, S.: MLP-aware Heterogeneous Main Memory. In: Proceedings of DATE (2011)","DOI":"10.1109\/DATE.2011.5763155"},{"key":"178_CR49","unstructured":"Powell, M., Gomaa, M., Vijaykumar, T.: Heat-and-run: leveraging SMT and CMP to manage power density through the operating system. In: Proceedings of ASPLOS (2004)"},{"key":"178_CR50","doi-asserted-by":"crossref","unstructured":"Qureshi, M.K.: Adaptive spill-receive for robust high-performance caching in CMPs. In: Proceedings of HPCA (2009)","DOI":"10.1109\/HPCA.2009.4798236"},{"key":"178_CR51","doi-asserted-by":"crossref","unstructured":"Rafique, N., Lim, W., Thottethodi, M.: Architectural support for operating system driven CMP cache management. In: Proceedings of PACT (2006)","DOI":"10.1145\/1152154.1152160"},{"key":"178_CR52","doi-asserted-by":"crossref","unstructured":"Rixner, S., Dally, W., Kapasi, U., Mattson, P., Owens, J.: Memory access scheduling. In: Proceedings of ISCA (2000)","DOI":"10.1145\/339647.339668"},{"key":"178_CR53","unstructured":"Romanchenko, V.: Quad-Core Opteron: Architecture and Roadmaps. http:\/\/www.digital-daily.com\/cpu\/quad_core_opteron"},{"key":"178_CR54","doi-asserted-by":"crossref","unstructured":"Sherwood, T., Calder, B., Emer, J.: Reducing cache misses using hardware and software page placement. In: Proceedings of SC (1999)","DOI":"10.1145\/305138.305189"},{"key":"178_CR55","doi-asserted-by":"crossref","unstructured":"Snavely, A., Tullsen, D., Voelker, G.: Symbiotic jobscheduling with priorities for a simultaneous multithreading processor. In: Proceedings of SIGMETRICS (2002)","DOI":"10.1145\/511334.511343"},{"key":"178_CR56","doi-asserted-by":"crossref","unstructured":"Speight, E., Shafi, H., Zhang, L., Rajamony, R.: Adaptive mechanisms and policies for managing cache hierarchies in chip multiprocessors. In: Proceedings of ISCA (2005)","DOI":"10.1109\/ISCA.2005.8"},{"key":"178_CR57","unstructured":"Swinburne, R.: Intel Core i7\u2014Nehalem Architecture Dive. http:\/\/www.bit-tech.net\/hardware\/2008\/11\/03\/intel-core-i7-nehalem-architecture-dive\/"},{"key":"178_CR58","doi-asserted-by":"crossref","unstructured":"Vantrease, D., et\u00a0al.: Corona: system implications of emerging nanophotonic technology. In: Proceedings of ISCA (2008)","DOI":"10.1109\/ISCA.2008.35"},{"key":"178_CR59","doi-asserted-by":"crossref","unstructured":"Verghese, B., Devine, S., Gupta, A., Rosenblum, M.: Operating system support for improving data locality on CC-NUMA compute servers. SIGPLAN Not. 31(9) (1996)","DOI":"10.1145\/248209.237205"},{"key":"178_CR60","unstructured":"Wallin, D., Zeffer, H., Karlsson, M., Hagersten, E.: VASA: a simulator infrastructure with adjustable fidelity. In: Proceedings of IASTED International Conference on Parallel and Distributed Computing and Systems (2005)"},{"key":"178_CR61","doi-asserted-by":"crossref","unstructured":"Wang, D., et\u00a0al.: DRAMsim: A memory-system simulator. In: SIGARCH Computer Architecture News (September 2005)","DOI":"10.1145\/1105734.1105748"},{"key":"178_CR62","doi-asserted-by":"crossref","unstructured":"Wentzlaff, D., et al.: On-Chip Interconnection Architecture of the Tile Processor. In: IEEE Micro 22, (2007)","DOI":"10.1109\/MM.2007.4378780"},{"key":"178_CR63","doi-asserted-by":"crossref","unstructured":"Zhang, M., Asanovic, K.: Victim replication: maximizing capacity while hiding wire delay in tiled chip multiprocessors. In: Proceedings of ISCA (2005)","DOI":"10.1145\/1080695.1069998"},{"key":"178_CR64","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhu, Z., Zhand, X.: A permutation-based page interleaving scheme to reduce row-buffer conflicts and exploit data locality. In: Proceedings of MICRO (2000)","DOI":"10.1145\/360128.360134"},{"key":"178_CR65","doi-asserted-by":"crossref","unstructured":"Zheng, H., et\u00a0al.: Mini-Rank: Adaptive DRAM architecture for improving memory power efficiency. In: Proceedings of MICRO (2008)","DOI":"10.1109\/MICRO.2008.4771792"},{"key":"178_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, X., Xu, Y., Du, Y., Zhang, Y., Yang, J.: Thermal management for 3D processor via task scheduling. In: Proceedings of ICPP (2008)","DOI":"10.1109\/ICPP.2008.51"},{"key":"178_CR67","unstructured":"Zhu, Z., Zhang, Z.: A Performance comparison of DRAM memory system optimizations for SMT processors. In: Proceedings of HPCA (2005)"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-011-0178-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-011-0178-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-011-0178-1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,13]],"date-time":"2019-06-13T20:57:28Z","timestamp":1560459448000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-011-0178-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011,8,7]]},"references-count":67,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2012,2]]}},"alternative-id":["178"],"URL":"https:\/\/doi.org\/10.1007\/s10766-011-0178-1","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2011,8,7]]}}}