{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,10,6]],"date-time":"2023-10-06T20:41:52Z","timestamp":1696624912491},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2011,8,18]],"date-time":"2011-08-18T00:00:00Z","timestamp":1313625600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2012,2]]},"DOI":"10.1007\/s10766-011-0182-5","type":"journal-article","created":{"date-parts":[[2011,8,17]],"date-time":"2011-08-17T12:58:06Z","timestamp":1313585886000},"page":"4-24","source":"Crossref","is-referenced-by-count":6,"title":["Data Layout Transformation Exploiting Memory-Level Parallelism in Structured Grid Many-Core Applications"],"prefix":"10.1007","volume":"40","author":[{"given":"I-Jui","family":"Sung","sequence":"first","affiliation":[]},{"given":"Nasser","family":"Anssari","sequence":"additional","affiliation":[]},{"given":"John A.","family":"Stratton","sequence":"additional","affiliation":[]},{"given":"Wen-Mei W.","family":"Hwu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2011,8,18]]},"reference":[{"issue":"8","key":"182_CR1","doi-asserted-by":"crossref","first-page":"166","DOI":"10.1145\/209937.209954","volume":"30","author":"J.M. Anderson","year":"1995","unstructured":"Anderson J.M., Amarasinghe S.P., Lam M.S.: Data and computation transformations for multiprocessors. SIGPLAN Not. 30(8), 166\u2013178 (1995)","journal-title":"SIGPLAN Not."},{"key":"182_CR2","unstructured":"Asanovic, K., Bodik, R., Catanzaro, B.C., Gebis, J.J., Husbands, P., Keutzer, K., Patterson, D.A., Plishker, W.L., Shalf, J., Williams, S.W., Yelick, K.A.: The landscape of parallel computing research: a view from berkeley. Technical report UCB\/EECS-2006-183, EECS Department, University of California, Berkeley, Dec 2006"},{"key":"182_CR3","doi-asserted-by":"crossref","unstructured":"Bakhoda, A., Yuan, G.L., Fung, W.W.L., Wong, H., Aamodt, T.M.: Analyzing cuda workloads using a detailed gpu simulator. In: ISPASS, pp. 163\u2013174. IEEE (2009)","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"182_CR4","doi-asserted-by":"crossref","unstructured":"Baskaran, M.M., Bondhugula, U., Krishnamoorthy, S., Ramanujam J., Rountev, A., Sadayappan, P.: A compiler framework for optimization of affine loop nests for gpgpus. In: ICS \u201908: Proceedings of the 22nd annual international conference on Supercomputing, pp. 225\u2013234. ACM, New York, NY, USA (2008)","DOI":"10.1145\/1375527.1375562"},{"key":"182_CR5","doi-asserted-by":"crossref","unstructured":"Datta, K., Murphy, M., Volkov, V., Williams, S., Carter, J., Oliker, L., Patterson, D., Shalf, J., Yelick, K.: Stencil computation optimization and auto-tuning on state-of-the-art multicore architectures. In: SC08: Proceedings of the 2008 Conference on Supercomputing, pp. 1\u201312. Piscataway, NJ, USA (2008)","DOI":"10.1109\/SC.2008.5222004"},{"key":"182_CR6","doi-asserted-by":"crossref","DOI":"10.1137\/1.9781611971446","volume-title":"Applied Numerical Linear Algebra","author":"J.W. Demmel","year":"1997","unstructured":"Demmel J.W.: Applied Numerical Linear Algebra. Society for Industrial and Applied Mathematics, Philadelphia, PA (1997)"},{"key":"182_CR7","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-642-98037-4","volume-title":"Computational Methods for Fluid Dynamics","author":"J.H. Ferziger","year":"1999","unstructured":"Ferziger J.H., Peric M.: Computational Methods for Fluid Dynamics. Springer, Berlin (1999)"},{"issue":"3","key":"182_CR8","doi-asserted-by":"crossref","first-page":"261","DOI":"10.1007\/s10766-006-0012-3","volume":"34","author":"S. Girbal","year":"2006","unstructured":"Girbal S., Vasilache N., Bastoul C., Cohen A., Parello D., Sigler M., Temam O.: Semi-automatic composition of loop transformations for deep parallelism and memory hierarchies. Int. J. Parallel Prog. 34(3), 261\u2013317 (2006)","journal-title":"Int. J. Parallel Prog."},{"key":"182_CR9","first-page":"8","volume":"33","author":"C.D. Gundolf","year":"2000","unstructured":"Gundolf C.D., Douglas C.C., Haase G., Hu J., Kowarschik M., Weiss C.: Portable memory hierarchy techniques for PDE solvers, part II. SIAM News 33, 8\u20139 (2000)","journal-title":"SIAM News"},{"issue":"3","key":"182_CR10","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1145\/1394608.1382172","volume":"36","author":"E. Ipek","year":"2008","unstructured":"Ipek E., Mutlu O., Mart\u00ednez J.F., Caruana R.: Self-optimizing memory controllers: A reinforcement learning approach. Comp. Arch. News 36(3), 39\u201350 (2008)","journal-title":"Comp. Arch. News"},{"key":"182_CR11","doi-asserted-by":"crossref","unstructured":"Jang, B., Mistry, P., Schaa, D., Dominguez, R., Kaeli, D.: Data transformations enabling loop vectorization on multithreaded data parallel architectures. In: PPoPP \u201910: Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 353\u2013354. ACM, New York, NY, USA (2010)","DOI":"10.1145\/1693453.1693510"},{"key":"182_CR12","doi-asserted-by":"crossref","unstructured":"Ju Y.-L., Dietz, H.G.: Reduction of cache coherence overhead by compiler data layout and loop transformation. In: Proceedings of the Fourth International Workshop on Languages and Compilers for Parallel Computing, pp. 344\u2013358. Springer, London, UK (1992)","DOI":"10.1007\/BFb0038675"},{"key":"182_CR13","doi-asserted-by":"crossref","unstructured":"Kennedy, K., Kremer, U.: Automatic data layout for high performance fortran. In: Supercomputing \u201995: Proceedings of the 1995 ACM\/IEEE conference on Supercomputing (CDROM), pp. 76. ACM, New York, NY, USA (1995)","DOI":"10.1145\/224170.224495"},{"key":"182_CR14","doi-asserted-by":"crossref","unstructured":"Kindratenko, V., Enos, J., Shi, G.: Gpu clusters for high-performance computing. In: Proceedings of the Workshop on Parallel Programming on Accelerator Clusters. Jan 2009","DOI":"10.1109\/CLUSTR.2009.5289128"},{"key":"182_CR15","doi-asserted-by":"crossref","unstructured":"Kwon, Y.-S., Koo, B.-T., Eum, N.-W.: Partial conflict-relieving programmable address shuffler for parallel memories in multi-core processor. In: ASP-DAC \u201909: Proceedings of the 2009 Asia and South Pacific Design Automation Conference, pp. 329\u2013334. IEEE Press, Piscataway, NJ, USA (2009)","DOI":"10.1109\/ASPDAC.2009.4796502"},{"key":"182_CR16","doi-asserted-by":"crossref","unstructured":"Lu, Q., Alias, C., Bondhugula, U., Henretty, T., Krishnamoorthy, S., Ramanujam, J., Rountev, A., Sadayappan, P., Chen, Y., Lin, H., Ngai, T.-f.: Data layout transformation for enhancing data locality on nuca chip multiprocessors. In: Proceedings of the 18th International Conference on Parallel Architectures and Compilation Techniques, pp. 348\u2013357 (2009)","DOI":"10.1109\/PACT.2009.36"},{"key":"182_CR17","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4613-2001-2","volume-title":"Memory Storage Patterns in Parallel Processing","author":"M.E. Mace","year":"1987","unstructured":"Mace M.E.: Memory Storage Patterns in Parallel Processing. Kluwer, Boston (1987)"},{"issue":"3es","key":"182_CR18","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1145\/357783.331677","volume":"5","author":"N.R. Mahapatra","year":"1999","unstructured":"Mahapatra N.R., Venkatrao B.: The processor-memory bottleneck: problems and solutions. Crossroads 5(3es), 2 (1999)","journal-title":"Crossroads"},{"key":"182_CR19","unstructured":"McVoy, L., Staelin, C.: lmbench: portable tools for performance analysis. In: Proceedings of the 1996 USENIX Annual Technical Conference, pp. 23\u201323 (1996)"},{"key":"182_CR20","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511812248","volume-title":"Numerical Solution of Partial Differential Equations: An Introduction","author":"K.W. Morton","year":"2005","unstructured":"Morton K.W., Mayers D.F.: Numerical Solution of Partial Differential Equations: An Introduction. Cambridge University Press, New York, NY (2005)"},{"key":"182_CR21","doi-asserted-by":"crossref","unstructured":"Moscibroda, T., Mutlu, O.: Distributed order scheduling and its application to multi-core DRAM controllers. In: Proceedings of the 27th Symposium on Principles of Distributed Computing, pp. 365\u2013374 (2008)","DOI":"10.1145\/1400751.1400799"},{"issue":"3","key":"182_CR22","doi-asserted-by":"crossref","first-page":"63","DOI":"10.1145\/1394608.1382128","volume":"36","author":"O. Mutlu","year":"2008","unstructured":"Mutlu O., Moscibroda T.: Parallelism-aware batch scheduling: enhancing both performance and fairness of shared DRAM systems. Comput. Arch. News 36(3), 63\u201374 (2008)","journal-title":"Comput. Arch. News"},{"key":"182_CR23","unstructured":"nVIDIA: nvidia cuda programming guide 2.0 (2008)"},{"issue":"4","key":"182_CR24","doi-asserted-by":"crossref","first-page":"549","DOI":"10.1142\/S0129626403001501","volume":"13","author":"T. Pohl","year":"2003","unstructured":"Pohl T., Kowarschik M., Wilke J., Iglberger K., R\u00fcde U.: Optimization and profiling of the cache performance of parallel lattice boltzmann codes. Parallel Process. Lett. 13(4), 549\u2013560 (2003)","journal-title":"Parallel Process. Lett."},{"issue":"6","key":"182_CR25","doi-asserted-by":"crossref","first-page":"479","DOI":"10.1209\/0295-5075\/17\/6\/001","volume":"17","author":"Y.H. Qian","year":"1992","unstructured":"Qian Y.H., D\u2019Humieres D., Lallemand P.: Lattice BGK models for Navier-Stokes equation. Europhys. Lett. 17(6), 479\u2013484 (1992)","journal-title":"Europhys. Lett."},{"key":"182_CR26","doi-asserted-by":"crossref","unstructured":"Rivera, G., Tseng, C.-W.: Tiling optimizations for 3D scientific computations. In: SC00: Proceedings of the 2000 conference on Supercomputing, p. 32 (2000)","DOI":"10.1109\/SC.2000.10015"},{"key":"182_CR27","doi-asserted-by":"crossref","unstructured":"Ryoo, S., Rodrigues, C.I., Baghsorkhi, S.S., Stone, S.S., Kirk, D.B., Hwu, W.-m.W.: Optimization principles and application performance evaluation of a multithreaded gpu using cuda. In: Proceedings of the 13th Symposium on Principles and Practice of Parallel Programming, pp. 73\u201382 (2008)","DOI":"10.1145\/1345206.1345220"},{"issue":"1","key":"182_CR28","doi-asserted-by":"crossref","first-page":"115","DOI":"10.1177\/1094342004041295","volume":"18","author":"S Sellappa","year":"2004","unstructured":"Sellappa S, Chatterjee S.: Cache-Efficient multigrid algorithms. Int. J. High Perform. Comput. Appl. 18(1), 115\u2013133 (2004)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"182_CR29","doi-asserted-by":"crossref","unstructured":"Shao, J., Davis, B.T.: A burst scheduling access reordering mechanism. In: Proceedings of the 13th International Symposium on High Performance Computer Architecture, pp. 285\u2013294 (2007)","DOI":"10.1109\/HPCA.2007.346206"},{"issue":"1","key":"182_CR30","doi-asserted-by":"crossref","first-page":"130","DOI":"10.1145\/1241601.1241625","volume":"35","author":"C.D. Spradling","year":"2007","unstructured":"Spradling C.D.: Spec cpu2006 benchmark tools. Comput. Arch. News 35(1), 130\u2013134 (2007)","journal-title":"Comput. Arch. News"},{"key":"182_CR31","doi-asserted-by":"crossref","unstructured":"Volkov, V., Demmel, J.W.: Benchmarking gpus to tune dense linear algebra. In: SC08: Proceedings of the 2008 Conference on Supercomputing, pp. 1\u201311 (2008)","DOI":"10.1109\/SC.2008.5214359"},{"issue":"5","key":"182_CR32","doi-asserted-by":"crossref","first-page":"323","DOI":"10.1007\/s00371-007-0191-y","volume":"24","author":"Y. Zhao","year":"2008","unstructured":"Zhao Y.: Lattice Boltzmann based PDE solver on the GPU. Vis. Comput. 24(5), 323\u2013333 (2008)","journal-title":"Vis. Comput."}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-011-0182-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-011-0182-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-011-0182-5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,14]],"date-time":"2019-06-14T04:07:55Z","timestamp":1560485275000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-011-0182-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011,8,18]]},"references-count":32,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2012,2]]}},"alternative-id":["182"],"URL":"https:\/\/doi.org\/10.1007\/s10766-011-0182-5","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2011,8,18]]}}}