{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T12:13:15Z","timestamp":1763467995814},"publisher-location":"Berlin, Heidelberg","reference-count":49,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642133732"},{"type":"electronic","value":"9783642133749"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2010]]},"DOI":"10.1007\/978-3-642-13374-9_4","type":"book-chapter","created":{"date-parts":[[2010,6,10]],"date-time":"2010-06-10T11:15:59Z","timestamp":1276168559000},"page":"50-64","source":"Crossref","is-referenced-by-count":49,"title":["Loop Transformation Recipes for Code Generation and Auto-Tuning"],"prefix":"10.1007","author":[{"given":"Mary","family":"Hall","sequence":"first","affiliation":[]},{"given":"Jacqueline","family":"Chame","sequence":"additional","affiliation":[]},{"given":"Chun","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Jaewook","family":"Shin","sequence":"additional","affiliation":[]},{"given":"Gabe","family":"Rudy","sequence":"additional","affiliation":[]},{"given":"Malik Murtaza","family":"Khan","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"4_CR1","unstructured":"http:\/\/www.peri-scidac.org\/wiki\/index.php\/Main_Page"},{"key":"4_CR2","unstructured":"http:\/\/rosecompiler.org\/"},{"key":"4_CR3","unstructured":"http:\/\/www.gnu.org\/prep\/standards\/html_node\/Errors.html"},{"key":"4_CR4","unstructured":"http:\/\/nek5000.mcs.anl.gov\/index.php\/Main_Page"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Ahmed, N., Mateev, N., Pingali, K.: Synthesizing transformations for locality enhancement of imperfectly-nested loop nests. In: Proceedings of the 2000 ACM International Conference on Supercomputing (May 2000)","DOI":"10.1145\/335231.335245"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Almagor, L., Cooper, K.D., Grosul, A., Harvey, T.J., Reeves, S.W., Subramanian, D., Torczon, L., Waterman, T.: Finding effective compilation sequences. In: Proceedings of ACM SIGPLAN Workshop on Languages, Compilers, and Tools for Embedded Systems, LCTES 2004 (June 2004)","DOI":"10.1145\/997163.997196"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Anderson, E., Sorensen, D., Bai, Z., Dongarra, J., Greenbaum, A., McKenney, A., Croz, J.D., Hammarling, S., Demmel, J., Bischof, C.H.: LAPACK: A portable linear algebra library for high-performance computers. In: Proceedings of Supercomputing 1990 (November 1990)","DOI":"10.1109\/SUPERC.1990.129995"},{"issue":"6","key":"4_CR8","doi-asserted-by":"publisher","first-page":"1768","DOI":"10.1145\/197320.197366","volume":"16","author":"S. Carr","year":"1994","unstructured":"Carr, S., Kennedy, K.: Improving the ratio of memory operations to floating-point operations in loops. ACM Transactions on Programming Languages and Systems\u00a016(6), 1768\u20131810 (1994)","journal-title":"ACM Transactions on Programming Languages and Systems"},{"key":"4_CR9","unstructured":"Chen, C.: Model-Guided Empirical Optimization for Memory Hierarchy. PhD thesis, University of Southern California (May 2007)"},{"key":"4_CR10","unstructured":"Chen, C., Chame, J., Hall, M.: CHiLL: A framework for composing high-level loop transformations. Technical Report 08-897, University of Southern California (June 2008)"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Chen, C., Chame, J., Hall, M.W.: Combining models and guided empirical search to optimize for multiple levels of the memory hierarchy. In: Proceedings of the International Symposium on Code Generation and Optimization (March 2005)","DOI":"10.1109\/CGO.2005.10"},{"issue":"1","key":"4_CR12","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1023\/A:1015729001611","volume":"23","author":"K.D. Cooper","year":"2002","unstructured":"Cooper, K.D., Subramanian, D., Torczon, L.: Adaptive optimizing compilers for the 21st century. The Journal of Supercomputing\u00a023(1), 7\u201322 (2002)","journal-title":"The Journal of Supercomputing"},{"key":"4_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"136","DOI":"10.1007\/978-3-540-69330-7_10","volume-title":"Languages and Compilers for Parallel Computing","author":"S. Donadio","year":"2006","unstructured":"Donadio, S., Brodman, J., Roeder, T., Yotov, K., Barthou, D., Cohen, A., Garzar\u00e1n, M.J., Padua, D., Pingali, K.: A language for the compact representation of multiple program versions. In: Ayguad\u00e9, E., Baumgartner, G., Ramanujam, J., Sadayappan, P. (eds.) LCPC 2005. LNCS, vol.\u00a04339, pp. 136\u2013151. Springer, Heidelberg (2006)"},{"issue":"2","key":"4_CR14","doi-asserted-by":"crossref","first-page":"216","DOI":"10.1109\/JPROC.2004.840301","volume":"93","author":"M. Frigo","year":"2005","unstructured":"Frigo, M., Johnson, S.G.: The design and implementation of FFTW3. Proceedings of the IEEE: Special Issue on Program Generation, Optimization, and Platform Adaptation\u00a093(2), 216\u2013231 (2005)","journal-title":"Proceedings of the IEEE: Special Issue on Program Generation, Optimization, and Platform Adaptation"},{"issue":"3","key":"4_CR15","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1007\/s10766-006-0012-3","volume":"34","author":"S. Girbal","year":"2006","unstructured":"Girbal, S., Vasilache, N., Bastoul, C., Cohen, A., Parello, D., Sigler, M., Temam, O.: Semi-automatic composition of loop transformations for deep parallelism and memory hierarchies. International Journal of Parallel Programming\u00a034(3), 261\u2013317 (2006)","journal-title":"International Journal of Parallel Programming"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Hartono, A., Norris, B., Sadayappan, P.: Annotation-based empirical performance tuning using Orio. In: Proceedings of the 23rd International Parallel and Distributed Processing Symposium (May 2009)","DOI":"10.1109\/IPDPS.2009.5161004"},{"key":"4_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"461","DOI":"10.1007\/978-3-540-45209-6_68","volume-title":"Euro-Par 2003 Parallel Processing","author":"J.R. Herrero","year":"2003","unstructured":"Herrero, J.R., Navarro, J.J.: Improving performance of hypermatrix cholesky factorization. In: Kosch, H., B\u00f6sz\u00f6rm\u00e9nyi, L., Hellwagner, H. (eds.) Euro-Par 2003. LNCS, vol.\u00a02790, pp. 461\u2013469. Springer, Heidelberg (2003)"},{"issue":"4","key":"4_CR18","doi-asserted-by":"publisher","first-page":"409","DOI":"10.1145\/567097.567101","volume":"24","author":"M. Jim\u00e9nez","year":"2002","unstructured":"Jim\u00e9nez, M., Llaber\u00eda, J.M., Fern\u00e1ndez, A.: Register tiling in nonrectangular iteration spaces. ACM Transactions on Programming Languages and Systems\u00a024(4), 409\u2013453 (2002)","journal-title":"ACM Transactions on Programming Languages and Systems"},{"key":"4_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"120","DOI":"10.1007\/978-3-540-89894-8_14","volume-title":"High Performance Computing - HiPC 2008","author":"D.K. Kaushik","year":"2008","unstructured":"Kaushik, D.K., Gropp, W., Minkoff, M., Smith, B.: Improving the performance of tensor matrix vector multiplication in cumulative reaction probability based quantum chemistry codes. In: Sadayappan, P., Parashar, M., Badrinath, R., Prasanna, V.K. (eds.) HiPC 2008. LNCS, vol.\u00a05374, pp. 120\u2013130. Springer, Heidelberg (2008)"},{"key":"4_CR20","unstructured":"Kelly, W., Pugh, W.: A framework for unifying reordering transformations. Technical Report CS-TR-3193, Department of Computer Science, University of Maryland (1993)"},{"key":"4_CR21","unstructured":"Kisuki, T., Knijnenburg, P.M.W., O\u2019Boyle, M.F.P.: Combined selection of tile sizes and unroll factors using iterative compilation. In: Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (October 2000)"},{"issue":"2-3","key":"4_CR22","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1002\/cpe.773","volume":"16","author":"P.M.W. Knijnenburg","year":"2004","unstructured":"Knijnenburg, P.M.W., Kisuki, T., Gallivan, K., O\u2019Boyle, M.F.P.: The effect of cache models on iterative compilation for combined tiling and unrolling. Concurrency and Computation: Practice and Experience\u00a016(2-3), 247\u2013270 (2004)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Kodukula, I., Ahmed, N., Pingali, K.: Data-centric multi-level blocking. In: Proceedings of ACM SIGPLAN Conference on Programming Language Design and Implementation (June 1997)","DOI":"10.1145\/258915.258946"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Lee, Y., Diniz, P., Hall, M., Lucas, R.: Empirical optimization for a sparse linear solver: A case study. International Journal of Parallel Programming\u00a033 (2005)","DOI":"10.1007\/s10766-005-3581-7"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Lim, A.W., Lam, M.S.: Maximizing parallelism and minimizing synchronization with affine partitioning. In: Proceedings of ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages (POPL 1997) (January 1997)","DOI":"10.1145\/263699.263719"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Lim, A.W., Liao, S.-W., Lam, M.S.: Blocking and array contraction across arbitrarily nested loops using affine partitioning. In: ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (June 2001)","DOI":"10.1145\/379539.379586"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Lu, Q., Krishnamoorthy, S., Sadaypppan, P.: Combining analytical and empirical approaches in tuning matrix transposition. In: Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (September 2006)","DOI":"10.1145\/1152154.1152190"},{"issue":"4","key":"4_CR28","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1145\/233561.233564","volume":"18","author":"K.S. McKinley","year":"1996","unstructured":"McKinley, K.S., Carr, S., Tseng, C.-W.: Improving data locality with loop transformations. ACM Transactions on Programming Languages and Systems\u00a018(4), 424\u2013453 (1996)","journal-title":"ACM Transactions on Programming Languages and Systems"},{"key":"4_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"248","DOI":"10.1007\/978-3-642-01970-8_25","volume-title":"Computational Science \u2013 ICCS 2009","author":"B. Norris","year":"2009","unstructured":"Norris, B., Hartono, A., Jessup, E., Siek, J.: Generating empirically optimized composed matrix kernels from matlab prototypes. In: Allen, G., Nabrzyski, J., Seidel, E., van Albada, G.D., Dongarra, J., Sloot, P.M.A. (eds.) Computational Science \u2013 ICCS 2009. LNCS, vol.\u00a05544, pp. 248\u2013258. Springer, Heidelberg (2009)"},{"key":"4_CR30","unstructured":"Pop, S., Cohen, A., Bastoul, C., Girbal, S., Silber, G.-A., Vasilache, N.: GRAPHITE: Polyhedral analyses and optimizations for GCC. In: Proceedings of the 4th GCC Developers\u2019 Summit (June 2006)"},{"key":"4_CR31","doi-asserted-by":"crossref","unstructured":"Pouchet, L.-N., Bastoul, C., Cohen, A., Cavazos, J.: Iterative optimization in the polyhedral model: Part I, one-dimensional time. In: Proceedings of the International Symposium on Code Generation and Optimization (March 2007)","DOI":"10.1109\/CGO.2007.21"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Pouchet, L.-N., Bastoul, C., Cohen, A., Vasilache, N.: Iterative optimization in the polyhedral model: Part II, multi-dimensional time. In: Proceedings of ACM SIGPLAN Conference on Programming Language Design and Implementation (June 2008)","DOI":"10.1145\/1375581.1375594"},{"key":"4_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1007\/3-540-44905-1_11","volume-title":"Languages and Compilers for Parallel Computing","author":"B. Pugh","year":"1999","unstructured":"Pugh, B., Rosser, E.: Iteration space slicing for locality. In: Carter, L., Ferrante, J. (eds.) LCPC 1999. LNCS, vol.\u00a01863, p. 164. Springer, Heidelberg (1999)"},{"key":"4_CR34","unstructured":"Qasem, A., Jin, G., Mellor-Crummey, J.: Improving performance with integrated program transformations. Technical Report TR03-419, Rice University (October 2003)"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Qasem, A., Kennedy, K.: Profitable loop fusion and tiling using model-driven empirical search. In: Proceedings of the 2006 ACM International Conference on Supercomputing (June 2006)","DOI":"10.1145\/1183401.1183437"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Ren, M., Park, J.Y., Houston, M., Aiken, A., Dally, W.J.: A tuning framework for software-managed memory hierarchies. In: Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (October 2008)","DOI":"10.1145\/1454115.1454155"},{"key":"4_CR37","doi-asserted-by":"crossref","unstructured":"Rivera, G., Tseng, C.-W.: Data transformations for eliminating conflict misses. In: Proceedings of ACM SIGPLAN Conference on Programming Language Design and Implementation (June 1998)","DOI":"10.1145\/277650.277661"},{"key":"4_CR38","doi-asserted-by":"crossref","unstructured":"Sarkar, V., Thekkath, R.: A general framework for iteration-reordering loop transformations. In: Proceedings of ACM SIGPLAN Conference on Programming Language Design and Implementation (June 1992)","DOI":"10.1145\/143095.143132"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Shin, J., Hall, M.W., Chame, J., Chen, C., Hovland, P.D.: Autotuning and specialization: Speeding up matrix multiply for small matrices with compiler technology. In: The Fourth International Workshop on Automatic Performance Tuning (October 2009)","DOI":"10.1007\/978-1-4419-6935-4_20"},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Temam, O., Granston, E.D., Jalby, W.: To copy or not to copy: A compile-time technique for assessing when data copying should be used to eliminate cache conflicts. In: Proceedings of Supercomputing 1993 (November 1993)","DOI":"10.1145\/169627.169762"},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Tiwari, A., Chen, C., Chame, J., Hall, M., Hollingsworth, J.K.: A scalable auto-tuning framework for compiler optimization. In: Proceedings of the 24th International Parallel and Distributed Processing Symposium (April 2009)","DOI":"10.1109\/IPDPS.2009.5161054"},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Tufo, H.M., Fischer, P.F.: Terascale spectral element algorithms and implementations. In: ACM\/IEEE conference on Supercomputing, Portland, OR (1999)","DOI":"10.1145\/331532.331599"},{"issue":"1-2","key":"4_CR43","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/S0167-8191(00)00087-9","volume":"27","author":"R.C. Whaley","year":"2001","unstructured":"Whaley, R.C., Petitet, A., Dongarra, J.J.: Automated empirical optimization of software and the ATLAS project. Parallel Computing\u00a027(1-2), 3\u201335 (2001)","journal-title":"Parallel Computing"},{"key":"4_CR44","unstructured":"Clint Whaley, R., Whaley, D.B.: Tuning high performance kernels through empirical compilation. In: Proceedings of the 34th International Conference on Parallel Processing (June 2005)"},{"key":"4_CR45","doi-asserted-by":"crossref","unstructured":"Wolf, M.E., Lam, M.S.: A data locality optimizing algorithm. In: Proceedings of ACM SIGPLAN Conference on Programming Language Design and Implementation (June 1991)","DOI":"10.1145\/113445.113449"},{"issue":"4","key":"4_CR46","doi-asserted-by":"publisher","first-page":"452","DOI":"10.1109\/71.97902","volume":"2","author":"M.E. Wolf","year":"1991","unstructured":"Wolf, M.E., Lam, M.S.: A loop transformation theory and an algorithm to maximize parallelism. IEEE Transactions on Parallel and Distributed Systems\u00a02(4), 452\u2013471 (1991)","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"issue":"4","key":"4_CR47","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1007\/BF00129834","volume":"4","author":"M. Wolfe","year":"1991","unstructured":"Wolfe, M.: Data dependence and program restructuring. The Journal of Supercomputing\u00a04(4), 321\u2013344 (1991)","journal-title":"The Journal of Supercomputing"},{"key":"4_CR48","unstructured":"Wolfe, M.: Compilers and more: Optimizing gpu kernels (October 2008), http:\/\/www.hpcwire.com\/features\/Compilers_and_More_Optimizing_GPU_Kernels.html"},{"key":"4_CR49","doi-asserted-by":"crossref","unstructured":"Yi, Q., Seymour, K., You, H., Vuduc, R., Quinlan, D.: POET: parameterized optimizations for empirical tuning. In: Proceedings of the 21st International Parallel and Distributed Processing Symposium (March 2007)","DOI":"10.1109\/IPDPS.2007.370637"}],"container-title":["Lecture Notes in Computer Science","Languages and Compilers for Parallel Computing"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-13374-9_4.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,11,24]],"date-time":"2020-11-24T03:05:30Z","timestamp":1606187130000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-13374-9_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010]]},"ISBN":["9783642133732","9783642133749"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-13374-9_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2010]]}}}