{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T12:13:42Z","timestamp":1763468022044},"publisher-location":"Berlin, Heidelberg","reference-count":38,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642198601"},{"type":"electronic","value":"9783642198618"}],"license":[{"start":{"date-parts":[[2011,1,1]],"date-time":"2011-01-01T00:00:00Z","timestamp":1293840000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2011]]},"DOI":"10.1007\/978-3-642-19861-8_15","type":"book-chapter","created":{"date-parts":[[2011,3,14]],"date-time":"2011-03-14T17:39:26Z","timestamp":1300124366000},"page":"266-285","source":"Crossref","is-referenced-by-count":4,"title":["Practical Loop Transformations for Tensor Contraction Expressions on Multi-level Memory Hierarchies"],"prefix":"10.1007","author":[{"given":"Wenjing","family":"Ma","sequence":"first","affiliation":[]},{"given":"Sriram","family":"Krishnamoorthy","sequence":"additional","affiliation":[]},{"given":"Gagan","family":"Agrawal","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"issue":"5","key":"15_CR1","doi-asserted-by":"publisher","first-page":"493","DOI":"10.1023\/A:1012293814832","volume":"29","author":"N. Ahmed","year":"2001","unstructured":"Ahmed, N., Mateev, N., Pingali, K.: Synthesizing transformations for locality enhancement of imperfectly-nested loop nests. International Journal of Parallel Programming\u00a029(5), 493\u2013544 (2001)","journal-title":"International Journal of Parallel Programming"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Apr\u00e0, E., Rendell, A.P., Harrison, R.J., Tipparaju, V., deJong, W.A., Xantheas, S.S.: Liquid water: obtaining the right answer for the right reasons. In: SC (2009)","DOI":"10.1145\/1654059.1654127"},{"issue":"1","key":"15_CR3","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1103\/RevModPhys.79.291","volume":"79","author":"R.J. Bartlett","year":"2007","unstructured":"Bartlett, R.J., Musial\u0308, M.: Coupled-cluster Theory in Quantum Chemistry. Rev. Mod. Phys.\u00a079(1), 291\u2013352 (2007)","journal-title":"Rev. Mod. Phys."},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Baskaran, M.M., Bondhugula, U., Krishnamoorthy, S., Ramanujam, J., Rountev, A., Sadayappan, P.: Automatic data movement and computation mapping for multi-level parallel architectures with explicitly managed memories. In: PPoPP, pp. 1\u201310 (2008)","DOI":"10.1145\/1345206.1345210"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Bordawekar, R., Choudhary, A., Kennedy, K., Koelbel, C., Paleczny, M.: A model and compilation strategy for out-of-core data parallel programs. In: PPoPP, pp. 1\u201310 (July 1995)","DOI":"10.1145\/209937.209938"},{"issue":"2","key":"15_CR6","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1145\/377769.377774","volume":"19","author":"A.D. Brown","year":"2001","unstructured":"Brown, A.D., Mowry, T.C., Krieger, O.: Compiler-based i\/o prefetching for out-of-core applications. ACM Trans. Comput. Syst.\u00a019(2), 111\u2013170 (2001)","journal-title":"ACM Trans. Comput. Syst."},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Cascaval, C., Padua, D.A.: Estimating cache misses and locality using stack distances. In: ICS, pp. 150\u2013159 (2003)","DOI":"10.1145\/782832.782836"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Coleman, S., McKinley, K.S.: Tile size selection using cache organization and data layout. In: PLDI, pp. 279\u2013290 (1995)","DOI":"10.1145\/223428.207162"},{"key":"15_CR9","volume-title":"Introduction to algorithms","author":"T.H. Cormen","year":"2001","unstructured":"Cormen, T.H., Leiserson, C.E., Rivest, R.L.: Introduction to algorithms. MIT Press, Cambridge (2001)"},{"issue":"9","key":"15_CR10","doi-asserted-by":"publisher","first-page":"1175","DOI":"10.1016\/S0167-8191(00)00034-X","volume":"26","author":"A. Darte","year":"2000","unstructured":"Darte, A.: On the complexity of loop fusion. Parallel Computing\u00a026(9), 1175\u20131193 (2000)","journal-title":"Parallel Computing"},{"issue":"10","key":"15_CR11","doi-asserted-by":"publisher","first-page":"1242","DOI":"10.1109\/TC.2005.167","volume":"54","author":"A. Darte","year":"2005","unstructured":"Darte, A., Schreiber, R., Villard, G.: Lattice-based memory allocation. IEEE Trans. Computers\u00a054(10), 1242\u20131257 (2005)","journal-title":"IEEE Trans. Computers"},{"key":"15_CR12","doi-asserted-by":"crossref","first-page":"245","DOI":"10.1145\/781131.781159","volume-title":"PLDI","author":"C. Ding","year":"2003","unstructured":"Ding, C., Zhong, Y.: Predicting whole-program locality through reuse distance analysis. In: PLDI, pp. 245\u2013257. ACM, New York (2003)"},{"key":"15_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1007\/978-3-642-13374-9_29","volume-title":"Languages and Compilers for Parallel Computing","author":"B. Diouf","year":"2010","unstructured":"Diouf, B., Ozturk, O., Cohen, A.: Optimizing local memory allocation and assignment through a decoupled approach. In: Gao, G.R., Pollock, L.L., Cavazos, J., Li, X. (eds.) LCPC 2009. LNCS, vol.\u00a05898, pp. 408\u2013415. Springer, Heidelberg (2010)"},{"issue":"18","key":"15_CR14","doi-asserted-by":"publisher","first-page":"2425","DOI":"10.1002\/cpe.1182","volume":"19","author":"X. Gao","year":"2007","unstructured":"Gao, X., Krishnamoorthy, S., Sahoo, S.K., Lam, C.-C., Baumgartner, G., Ramanujam, J., Sadayappan, P.: Efficient search-space pruning for integrated fusion and tiling transformations. Concurrency and Computation: Practice and Experience\u00a019(18), 2425\u20132443 (2007)","journal-title":"Concurrency and Computation: Practice and Experience"},{"issue":"46","key":"15_CR15","doi-asserted-by":"publisher","first-page":"9887","DOI":"10.1021\/jp034596z","volume":"107","author":"S. Hirata","year":"2003","unstructured":"Hirata, S.: Tensor contraction engine: Abstraction and automated parallel implementation of configuration-interaction, coupled-cluster, and many-body perturbation theories. Journal of Physical Chemistry A\u00a0107(46), 9887\u20139897 (2003)","journal-title":"Journal of Physical Chemistry A"},{"issue":"3","key":"15_CR16","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1023\/B:SUPE.0000011388.54204.8e","volume":"27","author":"C.-h. Hsu","year":"2004","unstructured":"Hsu, C.-h., Kremer, U.: A quantitative analysis of tile size selection algorithms. J. Supercomput.\u00a027(3), 279\u2013294 (2004)","journal-title":"J. Supercomput."},{"key":"15_CR17","unstructured":"Kandemir, M., Choudhary, A., Choudhary, A.: Compiler optimizations for i\/o intensive computations. In: Proceedings of International Conference on Parallel Processing (September 1999)"},{"issue":"3-4","key":"15_CR18","doi-asserted-by":"publisher","first-page":"597","DOI":"10.1016\/S0167-8191(98)00027-1","volume":"24","author":"M. Kandemir","year":"1998","unstructured":"Kandemir, M., Choudhary, A., Ramanujam, J., Bordawekar, R.: Compilation techniques for out-of-core parallel computations. Parallel Computing\u00a024(3-4), 597\u2013628 (1998)","journal-title":"Parallel Computing"},{"key":"15_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0025874","volume-title":"Languages and Compilers for Parallel Computing","author":"W. Kelly","year":"1995","unstructured":"Kelly, W., Pugh, W.: Finding legal reordering transformations using mappings. In: Pingali, K.K., Gelernter, D., Padua, D.A., Banerjee, U., Nicolau, A. (eds.) LCPC 1994. LNCS, vol.\u00a0892. Springer, Heidelberg (1995)"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Li, L., Nguyen, Q.H., Xue, J.: Scratchpad allocation for data aggregates in superperfect graphs. In: LCTES 2007: Proceedings of Conference on Languages, Compilers, and Tools for Embedded Systems, pp. 207\u2013216 (2007)","DOI":"10.1145\/1254766.1254805"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Lim, A.W., Cheong, G.I., Lam, M.S.: An affine partitioning algorithm to maximize parallelism and minimize communication. In: International Conference on Supercomputing, pp. 228\u2013237 (1999)","DOI":"10.1145\/305138.305197"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Ma, W., Agrawal, G.: A Translation System for Enabling Data Mining Applications on GPUs. In: Proceedings of International Conference on Supercomputing (ICS) (June 2009)","DOI":"10.1145\/1542275.1542331"},{"issue":"4","key":"15_CR23","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1145\/233561.233564","volume":"18","author":"K.S. McKinley","year":"1996","unstructured":"McKinley, K.S., Carr, S., Tseng, C.-W.: Improving data locality with loop transformations. ACM Transactions on Programming Languages and Systems\u00a018(4), 424\u2013453 (1996)","journal-title":"ACM Transactions on Programming Languages and Systems"},{"issue":"4","key":"15_CR24","doi-asserted-by":"publisher","first-page":"288","DOI":"10.1145\/329466.329484","volume":"17","author":"K.S. McKinley","year":"1999","unstructured":"McKinley, K.S., Temam, O.: Quantifying loop nest locality using spec\u201995 and the perfect benchmarks. ACM Trans. Comput. Syst.\u00a017(4), 288\u2013336 (1999)","journal-title":"ACM Trans. Comput. Syst."},{"issue":"6","key":"15_CR25","doi-asserted-by":"publisher","first-page":"641","DOI":"10.1023\/A:1018782528453","volume":"26","author":"N. Mitchell","year":"1998","unstructured":"Mitchell, N., H\u00f6gstedt, K., Carter, L., Ferrante, J.: Quantifying the multi-level nature of tiling interactions. International Journal of Parallel Programming\u00a026(6), 641\u2013670 (1998)","journal-title":"International Journal of Parallel Programming"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Moazeni, M., Bui, A., Sarrafzadeh, M.: A Memory Optimization Technique for Software-Managed Scratchpad Memory in GPUs (July 2009), http:\/\/www.sasp-conference.org\/index.html","DOI":"10.1109\/SASP.2009.5226334"},{"issue":"2","key":"15_CR27","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1007\/BF00130708","volume":"10","author":"J. Nieplocha","year":"1996","unstructured":"Nieplocha, J., Harrison, R.J., Littlefield, R.J.: Global arrays: A nonuniform memory access programming model for high-performance computers. Journal of Supercomputing\u00a010(2), 169\u2013189 (1996)","journal-title":"Journal of Supercomputing"},{"issue":"2","key":"15_CR28","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1007\/s11227-006-7957-2","volume":"36","author":"A. Qasem","year":"2006","unstructured":"Qasem, A., Kennedy, K., Mellor-Crummey, J.M.: Automatic tuning of whole applications using direct search and a performance-based transformation system. The Journal of Supercomputing\u00a036(2), 183\u2013196 (2006)","journal-title":"The Journal of Supercomputing"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Ren, M., Park, J.Y., Houston, M., Aiken, A., Dally, W.J.: A tuning framework for software-managed memory hierarchies. In: PACT, pp. 280\u2013291 (2008)","DOI":"10.1145\/1454115.1454155"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Renganarayana, L., Harthikote-matha, M., Dewri, R., Rajopadhye, S.: Towards optimal multi-level tiling for stencil computations. In: IPDPS (2007)","DOI":"10.1109\/IPDPS.2007.370291"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Renganarayana, L., Rajopadhye, S.: Positivity, posynomials and tile size selection. In: SC, pp. 1\u201312 (2008)","DOI":"10.1109\/SC.2008.5213293"},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Rivera, G., wen Tseng, C.: Locality Optimizations for Multi-level Caches. In: Proceedings of the SC 1999 (November 1999)","DOI":"10.1145\/331532.331534"},{"key":"15_CR33","first-page":"13","volume-title":"SC","author":"S.K. Sahoo","year":"2005","unstructured":"Sahoo, S.K., Krishnamoorthy, S., Panuganti, R., Sadayappan, P.: Integrated loop optimizations for data locality enhancement of tensor contraction expressions. In: SC, p. 13. IEEE Computer Society, Los Alamitos (2005)"},{"key":"15_CR34","doi-asserted-by":"crossref","unstructured":"Sundaram, N., Raghunathan, A., Chakradhar, S.: A framework for efficient and scalable execution of domain-specific templates on GPUs. In: IPDPS (2009)","DOI":"10.1109\/IPDPS.2009.5161039"},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Tarditi, D., Puri, S., Oglesby, J.: Accelerator: using data parallelism to program gpus for general-purpose uses. In: ASPLOS, pp. 325\u2013335 (2006)","DOI":"10.1145\/1168919.1168898"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Thakur, R., Bordawekar, R., Choudhary, A.: Compilation of out-of-core data parallel programs for distributed memory machines. In: Second Annual Workshop on Input\/Output in Parallel Computer Systems (IPPS), pp. 54\u201372 (April 1994)","DOI":"10.1145\/190787.190793"},{"issue":"2","key":"15_CR37","doi-asserted-by":"publisher","first-page":"472","DOI":"10.1145\/1151074.1151085","volume":"5","author":"S. Udayakumaran","year":"2006","unstructured":"Udayakumaran, S., Dominguez, A., Barua, R.: Dynamic allocation for scratch-pad memory using compile-time decisions. ACM Trans. Embed. Comput. Syst.\u00a05(2), 472\u2013511 (2006)","journal-title":"ACM Trans. Embed. Comput. Syst."},{"issue":"3","key":"15_CR38","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1023\/B:SUPE.0000011386.69245.f5","volume":"27","author":"Q. Yi","year":"2004","unstructured":"Yi, Q., Kennedy, K., Adve, V.: Transforming complex loop nests for locality. J. Supercomput.\u00a027(3), 219\u2013264 (2004)","journal-title":"J. Supercomput."}],"container-title":["Lecture Notes in Computer Science","Compiler Construction"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-19861-8_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,5]],"date-time":"2023-06-05T18:42:18Z","timestamp":1685990538000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-19861-8_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011]]},"ISBN":["9783642198601","9783642198618"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-19861-8_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2011]]}}}