{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,21]],"date-time":"2025-01-21T05:19:58Z","timestamp":1737436798082,"version":"3.33.0"},"publisher-location":"Berlin, Heidelberg","reference-count":43,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540425007"},{"type":"electronic","value":"9783540446880"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2001]]},"DOI":"10.1007\/3-540-44688-5_3","type":"book-chapter","created":{"date-parts":[[2007,8,28]],"date-time":"2007-08-28T13:43:33Z","timestamp":1188308613000},"page":"26-38","source":"Crossref","is-referenced-by-count":8,"title":["Fractal Matrix Multiplication: A Case Study on Portability of Cache Performance"],"prefix":"10.1007","author":[{"given":"Gianfranco","family":"Bilardi","sequence":"first","affiliation":[]},{"given":"Paolo","family":"D\u2019Alberto","sequence":"additional","affiliation":[]},{"given":"Alex","family":"Nicolau","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2001,8,17]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"A. Aggarwal, B. Alpern, A. K. Chandra and M. Snir: A model for hierarchical memory. Proc. of 19th Annual ACM Symposium on the Theory of Computing, New York, 1987, 305\u2013314.","DOI":"10.1145\/28395.28428"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"A. Aggarwal, A. K. Chandra and M. Snir: Hierarchical memory with block transfer. 1987 IEEE.","DOI":"10.1109\/SFCS.1987.31"},{"key":"3_CR3","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1007\/BF01185206","volume":"12","author":"B. Alpern","year":"1994","unstructured":"B. Alpern, L. Carter, E. Feig and T. Selker: The uniform memory hierarchy model of computation. In Algorithmica, vol. 12, (1994), 72\u2013129.","journal-title":"Algorithmica"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"U. Banerjee, R. Eigenmann, A. Nicolau and D. Padua: Automatic program parallelization. Proceedings of the IEEE vol 81, n. 2 Feb. 1993.","DOI":"10.1109\/5.214548"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"G. Bilardi, P. D\u2019Alberto, and A. Nicolau: Fractal Matrix Multiplication: a Case Study on Portability of Cache Performance, University of California at Irvine, ICS TR#00-21, 2000.","DOI":"10.1007\/3-540-44688-5_3"},{"key":"3_CR6","doi-asserted-by":"publisher","first-page":"531","DOI":"10.1007\/s002240000131","volume":"32","author":"G. Bilardi","year":"1999","unstructured":"G. Bilardi and F. P. Preparata: Processor-time tradeoffs under bounded-speed message propagation. Part II: lower bounds. Theory of Computing Systems, Vol. 32, 531\u2013559, 1999.","journal-title":"Theory of Computing Systems"},{"key":"3_CR7","doi-asserted-by":"crossref","unstructured":"G. Bilardi, E. Peserico: An Approach toward an Analytical Characterization of Locality and its Portability. IWIA 2000, International Workshop on Innovative Architectures, Maui, Hawai, January 2001.","DOI":"10.1109\/IWIA.2001.955195"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"G. Bilardi, E. Peserico: A Characterization of Temporal Locality and its Portability Across Memory Hierarchies. ICALP 2001, International Colloquium on Automata, Languages, and Programming, Crete, July 2001.","DOI":"10.1007\/3-540-48224-5_11"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"G. Bilardi, A. Pietracaprina, and P. D\u2019Alberto: On the space and access complexity of computation DAGs. 26th Workshop on Graph-Theoretic Concepts in Computer Science, Konstanz, Germany, June 2000.","DOI":"10.1007\/3-540-40064-8_6"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"J. Bilmes, Krste Asanovic, C. Chin and J. Demmel: Optimizing matrix multiply using PHiPAC: a portable, high-performance, Ansi C coding methodology. International Conference on Supercomputing, July 1997.","DOI":"10.1145\/263580.263662"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"S. Carr and K. Kennedy: Compiler blockability of numerical algorithms. Proceedings of Supercomputing Nov 1992, pg. 114\u2013124.","DOI":"10.1109\/SUPERC.1992.236704"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"S. Chatterjee, V. V. Jain, A. R. Lebeck and S. Mundhra: Nonlinear array layouts for hierarchical memory systems. Proc. of ACM international Conference on Supercomputing, Rhodes,Greece, June 1999.","DOI":"10.1145\/305138.305231"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"S. Chatterjee, A. R. Lebeck, P. K. Patnala and M. Thottethodi: Recursive array layout and fast parallel matrix multiplication. Proc. 11-th ACM SIGPLAN, June 1999.","DOI":"10.1145\/305619.305645"},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"D. Coppersmith and S. Winograd: Matrix multiplication via arithmetic progression. In Poceedings of 9th annual ACM Symposium on Theory of Computing pag. 1\u20136, 1987.","DOI":"10.1145\/28395.28396"},{"key":"3_CR15","unstructured":"P. D\u2019Alberto, G. Bilardi and A. Nicolau: Fractal LU-decomposition with partial pivoting. Manuscript."},{"key":"3_CR16","unstructured":"M. J. Dayde and I. S. Duff: A blocked implementation of level 3 BLAS for RISC processors. TRPA9606, available on line http:\/\/www.cerfacs.fr\/algorreports\/TRPA9606.ps.gz Apr. 6 1996."},{"key":"3_CR17","unstructured":"N. Eiron, M. Rodeh and I. Steinwarts: Matrix multiplication: a case study of algorithm engineering. Proceedings WAE\u201998, Saarbr\u00fccken, Germany, Aug.20\u201322, 1998"},{"key":"3_CR18","unstructured":"Engineering and Scientific Subroutine Library. http:\/\/www.rs6000.ibm.com\/resource\/aixresource\/spbooks\/essl\/"},{"key":"3_CR19","unstructured":"P. Flajolet, G. Gonnet, C. Puech and J. M. Robson: The analysis of multidimentional searching in Quad-Tree. Proceeding of the second Annual ACM-SIAM symposium on Discrete Algorithms, San Francisco, 1991, pag. 100\u2013109."},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"J. D. Frens and D. S. Wise: Auto-blocking matrix-multiplication or tracking BLAS3 performance from source code. Proc. 1997 ACM Symp. on Principles and Practice of Parallel Programming, SIGPLAN Not. 32, 7 (July 1997), 206\u2013216.","DOI":"10.1145\/263767.263789"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"M. Frigo and S. G. Johnson: The fastest Fourier transform in the west. MIT-LCS-TR-728 Massachusetts Institute of technology, Sep. 11 1997.","DOI":"10.21236\/ADA479065"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"M. Frigo, C. E. Leiserson, H. Prokop and S. Ramachandran: Cache-oblivious algorithms. Proc. 40th Annual Symposium on Foundations of Computer Science, (1999).","DOI":"10.1109\/SFFCS.1999.814600"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"E. D. Granston, W. Jalby and O. Teman: To copy or not to copy: a compiletime technique for assessing when data copying should be used to eliminate cache conflicts. Proceedings of Supercomputing Nov 1993, pg. 410\u2013419.","DOI":"10.1145\/169627.169762"},{"key":"3_CR24","unstructured":"G. H. Golub and C. F. van Loan: Matrix computations. Johns Hopkins editor 3-rd edition."},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"F. G. Gustavson: Recursion leads to automatic variable blocking for dense linear algebra algorithms. Journal of Research and Development Volume 41, Number 6, November 1997.","DOI":"10.1147\/rd.416.0737"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"F. Gustavson, A. Henriksson, I. Jonsson, P. Ling, and B. Kagstrom: Recursive blocked data formats and BLAS\u2019s for dense linear algebra algorithms. In B. Kagstrom et al (eds), Applied Parallel Computing. Large Scale Scientific and Industrial Problems, PARA\u201998 Proceedings. Lecture Notes in Computing Science, No. 1541, p. 195\u2013206, Springer Verlag, 1998.","DOI":"10.1007\/BFb0095337"},{"key":"3_CR27","unstructured":"N. J. Higham: Accuracy and stability of numerical algorithms ed. SIAM 1996"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Hong Jia-Wei and T. H. Kung: I\/O complexity:The Red-Blue pebble game. Proc.of the 13th Ann. ACM Symposium on Theory of Computing Oct.1981, 326\u2013333.","DOI":"10.1145\/800076.802486"},{"key":"3_CR29","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1145\/292395.292426","volume":"24","author":"B. K\u00ffagstr\u00f6m","year":"1998","unstructured":"B. K\u00ffagstr\u00f6m, P. Ling and C. Van Loan: Algorithm 784: GEMM-based level 3 BLAS: portability and optimization issues. ACM transactions on Mathematical Software, Vol24, No. 3, Sept. 1998, pages 303\u2013316","journal-title":"ACM transactions on Mathematical Software"},{"key":"3_CR30","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1145\/292395.292412","volume":"24","author":"B. K\u00ffagstr\u00f6m","year":"1998","unstructured":"B. K\u00ffagstr\u00f6m, P. Ling and C. Van Loan: GEMM-based level 3 BLAS: highperformance model implementations and performance evaluation benchmark. ACM transactions on Mathematical Software, Vol24, No. 3, Sept. 1998, pages 268\u2013302.","journal-title":"ACM transactions on Mathematical Software"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"M. Lam, E. Rothberg and M. Wolfe: The cache performance and optimizations of blocked algorithms. Proceedings of the fourth international conference on architectural support for programming languages and operating system, Apr. 1991,pg. 63\u201374.","DOI":"10.1145\/106972.106981"},{"key":"3_CR32","unstructured":"S. S. Muchnick: Advanced compiler design implementation. Morgan Kaufman"},{"key":"3_CR33","unstructured":"P. D\u2019Alberto: Performance Evaluation of Data Locality Exploitation. Techincal Report UBLCS-2000-9. Department of Computer Science, University of Bologna."},{"key":"3_CR34","series-title":"Lect Notes Comput Sci","doi-asserted-by":"crossref","DOI":"10.1007\/3-540-62573-9","volume-title":"Solving Irregularly Structured Problems","author":"P. R. Panda","year":"1997","unstructured":"P. R. Panda, H. Nakamura, N. D. Dutt and A. Nicolau: Improving cache performance through tiling and data alignment. Solving Irregularly Structured Problems in Parallel Lecture Notes in Computer Science, Springer-Verlag 1997."},{"key":"3_CR35","unstructured":"John E. Savage: Space-Time tradeoff in memory hierarchies. Technical report Oct 19, 1993."},{"issue":"3","key":"3_CR36","doi-asserted-by":"publisher","first-page":"354","DOI":"10.1007\/BF02165411","volume":"14","author":"V. Strassen","year":"1969","unstructured":"V. Strassen: Gaussian elimination is not optimal. Numerische Mathematik 14(3):354\u2013356, 1969.","journal-title":"Numerische Mathematik"},{"issue":"4","key":"3_CR37","doi-asserted-by":"publisher","first-page":"1065","DOI":"10.1137\/S0895479896297744","volume":"18","author":"S. Toledo","year":"1997","unstructured":"S. Toledo: Locality of reference in LU decomposition with partial pivoting. SIAM J.Matrix Anal. Appl. Vol.18, No. 4, pp. 1065\u20131081, Oct. 1997","journal-title":"SIAM J.Matrix Anal. Appl."},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"M. Thottethodi, S. Chatterjee and A. R. Lebeck: Tuning Strassen\u2019s matrix multiplication for memory efficiency. Proc. SC98, Orlando,FL, nov.1998 ( http:\/\/www.supercomp.org\/sc98 .","DOI":"10.1109\/SC.1998.10045"},{"key":"3_CR39","unstructured":"R. C. Whaley and J. J. Dongarra: Automatically Tuned Linear Algebra Software. http:\/\/www.netlib.org\/atlas\/index.html"},{"key":"3_CR40","unstructured":"D. S. Wise: Undulant-block elimination and integer-preserving matrix inversion. Technical Report 418 Computer Science Department Indiana University August 1995"},{"key":"3_CR41","doi-asserted-by":"crossref","unstructured":"M. Wolfe: More iteration space tiling. Proceedings of Supercomputing, Nov. 1989, pg. 655\u2013665.","DOI":"10.1145\/76263.76337"},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"M. Wolfe and M. Lam: A Data locality optimizing algorithm. Proceedings of the ACM SIGPLAN\u201991 conference on programming Language Design and Implementation, Toronto, Ontario,Canada, June 26\u201328, 1991.","DOI":"10.1145\/113445.113449"},{"key":"3_CR43","unstructured":"M. Wolfe: High performance compilers for parallel computing. Addison-Wesley Pub.Co. 1995"}],"container-title":["Lecture Notes in Computer Science","Algorithm Engineering"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/3-540-44688-5_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T18:07:00Z","timestamp":1737396420000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/3-540-44688-5_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2001]]},"ISBN":["9783540425007","9783540446880"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/3-540-44688-5_3","relation":{},"ISSN":["0302-9743"],"issn-type":[{"type":"print","value":"0302-9743"}],"subject":[],"published":{"date-parts":[[2001]]}}}