{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T12:14:11Z","timestamp":1763468051418},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2011,11,24]],"date-time":"2011-11-24T00:00:00Z","timestamp":1322092800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2013,3]]},"DOI":"10.1007\/s10586-011-0179-2","type":"journal-article","created":{"date-parts":[[2011,11,24]],"date-time":"2011-11-24T10:11:40Z","timestamp":1322129500000},"page":"131-155","source":"Crossref","is-referenced-by-count":28,"title":["Optimizing tensor contraction expressions for hybrid CPU-GPU execution"],"prefix":"10.1007","volume":"16","author":[{"given":"Wenjing","family":"Ma","sequence":"first","affiliation":[]},{"given":"Sriram","family":"Krishnamoorthy","sequence":"additional","affiliation":[]},{"given":"Oreste","family":"Villa","sequence":"additional","affiliation":[]},{"given":"Karol","family":"Kowalski","sequence":"additional","affiliation":[]},{"given":"Gagan","family":"Agrawal","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2011,11,24]]},"reference":[{"key":"179_CR1","unstructured":"Anzt, H., Hahn, T., Heuveline, V., Rocker, B.: GPU accelerated scientific computing: evaluation of the NVIDIA Fermi architecture; elementary kernels and linear solvers (2010). http:\/\/www.emcl.kit.edu\/preprints\/emcl-preprint-2010-04.pdf"},{"key":"179_CR2","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1654059.1654127","volume-title":"Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing","author":"E. Apr\u00e0","year":"2009","unstructured":"Apr\u00e0, E., Rendell, A.P., Harrison, R.J., Tipparaju, V., deJong, W.A., Xantheas, S.S.: Liquid water: obtaining the right answer for the right reasons. In: Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing, pp. 1\u20137 (2009). doi: 10.1145\/1654059.1654127"},{"key":"179_CR3","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1080\/00268970500275780","volume":"2","author":"A. Auer","year":"2006","unstructured":"Auer, A., Baumgartner, G., Bernholdt, D., Bibireata, A., Choppella, V., Cociorva, D., Gao, X., Harrison, R., Krishnamoorthy, S., Krishnan, S., Lam, C., Lu, Q., Nooijen, M., Pitzer, R., Ramanujam, J., Sadayappan, P., Sibiryakov, A.: Automatic code generation for many-body electronic structure methods: the tensor contraction engine. Mol. Phys. 2, 211 (2006)","journal-title":"Mol. Phys."},{"key":"179_CR4","first-page":"105","volume-title":"Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)","author":"S.S. Baghsorkhi","year":"2010","unstructured":"Baghsorkhi, S.S., Delahaye, M., Patel, S.J., Gropp, W.D., Hwu,\u00a0W.M.: An adaptive performance modeling tool for GPU architectures. In: Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), pp. 105\u2013114 (2010). doi: 10.1145\/1693453.1693470"},{"issue":"1","key":"179_CR5","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1103\/RevModPhys.79.291","volume":"79","author":"R.J. Bartlett","year":"2007","unstructured":"Bartlett, R.J., Musia\u0142, M.: Coupled-cluster theory in quantum chemistry. Rev. Mod. Phys. 79(1), 291\u2013352 (2007). doi: 10.1103\/RevModPhys.79.291","journal-title":"Rev. Mod. Phys."},{"key":"179_CR6","first-page":"225","volume-title":"Proceedings of the International Conference on Supercomputing (ICS)","author":"M.M. Baskaran","year":"2008","unstructured":"Baskaran, M.M., Bondhugula, U., Krishnamoorthy, S., Ramanujam, J., Rountev, A., Sadayappan, P.: A compiler framework for optimization of affine loop nests for GPGPUs. In: Proceedings of the International Conference on Supercomputing (ICS), pp. 225\u2013234 (2008). doi: 10.1145\/1375527.1375562"},{"issue":"2","key":"179_CR7","doi-asserted-by":"crossref","first-page":"276","DOI":"10.1109\/JPROC.2004.840311","volume":"93","author":"G. Baumgartner","year":"2005","unstructured":"Baumgartner, G., Auer, A., Bernholdt, D., Bibireata, A., Choppella, V., Cociorva, D., Gao, X., Harrison, R., Hirata, S., Krishnamoorthy, S., et al.: Synthesis of high-performance parallel programs for a class of ab initio quantum chemistry models. Proc. IEEE 93(2), 276\u2013292 (2005)","journal-title":"Proc. IEEE"},{"key":"179_CR8","first-page":"1","volume-title":"Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS)","author":"M. Boyer","year":"2009","unstructured":"Boyer, M., Tarjan, D., Acton, S.T., Skadron, K.: Accelerating leukocyte tracking using CUDA: a case study in leveraging manycore coprocessors. In: Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS), pp. 1\u201312 (2009). doi: 10.1109\/IPDPS.2009.5160984"},{"issue":"10","key":"179_CR9","doi-asserted-by":"crossref","first-page":"1370","DOI":"10.1016\/j.jpdc.2008.05.014","volume":"68","author":"S. Che","year":"2008","unstructured":"Che, S., Meng, J., Sheaffer, J.W., Skadron, K.: A performance study of general-purpose applications on graphics processors using CUDA. J. Parallel Distrib. Comput. 68(10), 1370\u20131380 (2008). doi: 10.1016\/j.jpdc.2008.05.014","journal-title":"J. Parallel Distrib. Comput."},{"key":"179_CR10","first-page":"115","volume-title":"Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)","author":"J.W. Choi","year":"2010","unstructured":"Choi, J.W., Singh, A., Vuduc, R.W.: Model-driven autotuning of sparse matrix-vector multiply on GPUs. In: Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), pp. 115\u2013126 (2010). doi: 10.1145\/1693453.1693471"},{"issue":"11","key":"179_CR11","doi-asserted-by":"crossref","first-page":"4256","DOI":"10.1063\/1.1727484","volume":"45","author":"J. \u010ci\u017eek","year":"1966","unstructured":"\u010ci\u017eek, J.: On correlation problem in atomic and molecular systems. Calculation of wavefunction components in ursell-type expansion using quantum-field theoretical methods. J. Chem. Phys. 45(11), 4256\u20134266 (1966)","journal-title":"J. Chem. Phys."},{"key":"179_CR12","unstructured":"Consortium, H.T.: PCI Express 3.0 specification. http:\/\/www.hypertransport.org\/docs\/twgdocs\/HTC20051222-00046-0028.pdf (2011)"},{"issue":"5","key":"179_CR13","doi-asserted-by":"crossref","first-page":"1287","DOI":"10.1021\/ct100584w","volume":"7","author":"A.E. DePrince","year":"2011","unstructured":"DePrince, A.E., Hammond, J.R.: Coupled cluster theory on graphics processing units I. The coupled cluster doubles method. J.\u00a0Chem. Theory Comput. 7(5), 1287\u20131295 (2011). doi: 10.1021\/ct100584w . http:\/\/pubs.acs.org\/doi\/abs\/10.1021\/ct100584w","journal-title":"J.\u00a0Chem. Theory Comput."},{"key":"179_CR14","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1145\/1941553.1941589","volume-title":"Proceedings of the 16th ACM Symposium on Principles and Practice of Parallel Programming, PPoPP \u201911","author":"Y. Dotsenko","year":"2011","unstructured":"Dotsenko, Y., Baghsorkhi, S.S., Lloyd, B., Govindaraju, N.K.: Auto-tuning of fast Fourier transform on graphics processors. In: Proceedings of the 16th ACM Symposium on Principles and Practice of Parallel Programming, PPoPP \u201911, pp. 257\u2013266. ACM Press, New York (2011). doi: 10.1145\/1941553.1941589 . URL http:\/\/doi.acm.org\/10.1145\/1941553.1941589"},{"key":"179_CR15","doi-asserted-by":"crossref","first-page":"1007","DOI":"10.1063\/1.456153","volume":"90","author":"T. Dunning","year":"1989","unstructured":"Dunning, T.: Gaussian basis sets for use in correlated molecular calculations I. The atoms boron through neon and hydrogen. J.\u00a0Chem. Phys. 90, 1007\u20131023 (1989)","journal-title":"J.\u00a0Chem. Phys."},{"key":"179_CR16","doi-asserted-by":"crossref","first-page":"2074","DOI":"10.1021\/ct900227j","volume":"5","author":"C. Filippi","year":"2009","unstructured":"Filippi, C., Zaccheddu, M., Buda, F.: Absorption spectrum of the green fluorescent protein chromophore: a difficult case for ab initio methods? J. Chem. Theory Comput. 5, 2074\u20132087 (2009)","journal-title":"J. Chem. Theory Comput."},{"issue":"5","key":"179_CR17","doi-asserted-by":"crossref","first-page":"151","DOI":"10.1145\/1168917.1168877","volume":"40","author":"M.I. Gordon","year":"2006","unstructured":"Gordon, M.I., Thies, W., Amarasinghe, S.: Exploiting coarse-grained task, data, and pipeline parallelism in stream programs. Oper. Syst. Rev. 40(5), 151\u2013162 (2006). doi: 10.1145\/1168917.1168877","journal-title":"Oper. Syst. Rev."},{"key":"179_CR18","unstructured":"Hammond, J.R., De Prince, III, A.E.: Evaluating one-sided programming models for gpu cluster computations. http:\/\/saahpc.ncsa.illinois.edu\/papers\/paper_43.pdf (2011)"},{"key":"179_CR19","first-page":"197","volume-title":"Proceedings of the International Conference on High Performance Computing (HiPC)","author":"P. Harish","year":"2007","unstructured":"Harish, P., Narayanan, P.: Accelerating large graph algorithms on the GPU using CUDA. In: Proceedings of the International Conference on High Performance Computing (HiPC), pp. 197\u2013208 (2007)"},{"issue":"46","key":"179_CR20","doi-asserted-by":"crossref","first-page":"9887","DOI":"10.1021\/jp034596z","volume":"107","author":"S. Hirata","year":"2003","unstructured":"Hirata, S.: Tensor contraction engine: abstraction and automated parallel implementation of configuration-interaction, coupled-cluster, and many-body perturbation theories. J. Phys. Chem. 107(46), 9887\u20139897 (2003)","journal-title":"J. Phys. Chem."},{"key":"179_CR21","doi-asserted-by":"crossref","first-page":"152","DOI":"10.1145\/1555754.1555775","volume-title":"ISCA \u201909: Proceedings of the 36th Annual International Symposium on Computer Architecture","author":"S. Hong","year":"2009","unstructured":"Hong, S., Kim, H.: An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness. In: ISCA \u201909: Proceedings of the 36th Annual International Symposium on Computer Architecture, pp. 152\u2013163. ACM Press, New York (2009). doi: 10.1145\/1555754.1555775"},{"key":"179_CR22","unstructured":"Intel: An introduction to the Intel QuickPath Interconnect. Document Number: 320412, January 2009, http:\/\/www.intel.com\/technology\/quickpath\/introduction.pdf"},{"key":"179_CR23","volume-title":"Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing","author":"K. Kowalski","year":"2011","unstructured":"Kowalski, K., Krishnamoorthy, S., Olson, R.M., Tipparaju, V., Apra, E.: Scalable implementations of accurate excited-state coupled cluster theories: application of high-level methods to porphyrin-based systems. In: Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing (2011). doi: 10.1145\/2063384.2063481"},{"key":"179_CR24","first-page":"884","volume-title":"Proceedings of the International Conference on Computational Science (ICCS)","author":"Y. Li","year":"2009","unstructured":"Li, Y., Dongarra, J., Tomov, S.: A note on auto-tuning GEMM for GPUs. In: Proceedings of the International Conference on Computational Science (ICCS), pp. 884\u2013892 (2009). doi: 10.1007\/978-3-642-01970-8-89"},{"key":"179_CR25","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1145\/1152154.1152190","volume-title":"Proceedings of the Conference on Parallel Architectures and Compilation Techniques (PACT)","author":"Q. Lu","year":"2006","unstructured":"Lu, Q., Krishnamoorthy, S., Sadayappan, P.: Combining analytical and empirical approaches in tuning matrix transposition. In: Proceedings of the Conference on Parallel Architectures and Compilation Techniques (PACT), pp. 233\u2013242 (2006). doi: 10.1145\/1152154.1152190"},{"key":"179_CR26","doi-asserted-by":"crossref","first-page":"400","DOI":"10.1145\/1542275.1542331","volume-title":"Proceedings of the International Conference on Supercomputing (ICS)","author":"W. Ma","year":"2009","unstructured":"Ma, W., Agrawal, G.: A translation system for enabling data mining applications on GPUs. In: Proceedings of the International Conference on Supercomputing (ICS), pp. 400\u2013409 (2009). doi: 10.1145\/1542275.1542331"},{"issue":"5","key":"179_CR27","doi-asserted-by":"crossref","first-page":"1316","DOI":"10.1021\/ct1007247","volume":"7","author":"W. Ma","year":"2011","unstructured":"Ma, W., Krishnamoorthy, S., Villa, O., Kowalski, K.: GPU-based implementations of the noniterative regularized-CCSD(T) corrections: applications to strongly correlated systems. J. Chem. Theory Comput. 7(5), 1316\u20131327 (2011). doi: 10.1021\/ct1007247 . URL http:\/\/pubs.acs.org\/doi\/abs\/10.1021\/ct1007247","journal-title":"J. Chem. Theory Comput."},{"key":"179_CR28","first-page":"261","volume-title":"Proceedings of the Conference on Parallel Architectures and Compilation Techniques (PACT)","author":"D. Molka","year":"2009","unstructured":"Molka, D., Hackenberg, D., Schone, R., Muller, M.S.: Memory performance and cache coherency effects on an intel nehalem multiprocessor system. In: Proceedings of the Conference on Parallel Architectures and Compilation Techniques (PACT), pp. 261\u2013270 (2009). doi: 10.1109\/PACT.2009.22"},{"key":"179_CR29","doi-asserted-by":"crossref","unstructured":"Murthy, S.G.: Optimal loop unrolling for GPGPU programs. Master\u2019s thesis, The Ohio State University (2009)","DOI":"10.1109\/IPDPS.2010.5470423"},{"key":"179_CR30","doi-asserted-by":"crossref","unstructured":"Nath, R., Tomov, S., Dongarra, J.: An improved MAGMA GEMM for fermi GPUs. http:\/\/icl.cs.utk.edu\/projectsfiles\/magma\/pubs\/fermi_gemm.pdf (2010)","DOI":"10.1177\/1094342010385729"},{"issue":"2","key":"179_CR31","doi-asserted-by":"crossref","first-page":"40","DOI":"10.1145\/1365490.1365500","volume":"6","author":"J. Nickolls","year":"2008","unstructured":"Nickolls, J., Buck, I., Garland, M., Skadron, K.: Scalable parallel programming with CUDA. ACM Queue 6(2), 40\u201353 (2008). doi: 10.1145\/1365490.1365500","journal-title":"ACM Queue"},{"issue":"2","key":"179_CR32","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1177\/1094342006064504","volume":"20","author":"J. Nieplocha","year":"2006","unstructured":"Nieplocha, J., Tipparaju, V., Krishnan, M., Panda, D.: High performance remote memory access communication: the armci approach. Int. J. High Perform. Comput. Appl. 20(2), 233 (2006)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"179_CR33","first-page":"1","volume-title":"Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing","author":"A. Nukada","year":"2008","unstructured":"Nukada, A., Ogata, Y., Endo, T., Matsuoka, S.: Bandwidth intensive 3-D FFT kernel for GPUs using CUDA. In: Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing, pp. 1\u201311 (2008)"},{"key":"179_CR34","unstructured":"Nvidia: NVIDIA\u2019s next generation CUDA compute architecture: Fermi. http:\/\/www.nvidia.com\/object\/fermi_architecture.html"},{"key":"179_CR35","unstructured":"NVIDIA: NVIDIA CUDA Programming guide, version 3.0 (2010)"},{"key":"179_CR36","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1002\/9780470141694.ch1","volume":"110","author":"J. Paldus","year":"1999","unstructured":"Paldus, J., Li, X.: A critical assessment of coupled cluster method in quantum chemistry. Adv. Chem. Phys. 110, 1\u2013175 (1999)","journal-title":"Adv. Chem. Phys."},{"issue":"6","key":"179_CR37","doi-asserted-by":"crossref","first-page":"479","DOI":"10.1016\/S0009-2614(89)87395-6","volume":"157","author":"K. Raghavachari","year":"1989","unstructured":"Raghavachari, K., Trucks, G.W., Pople, J.A., Head-Gordon, M.: A\u00a05th-order perturbation comparison of electron correlation theories. Chem. Phys. Lett. 157(6), 479\u2013483 (1989)","journal-title":"Chem. Phys. Lett."},{"key":"179_CR38","doi-asserted-by":"crossref","first-page":"73","DOI":"10.1145\/1345206.1345220","volume-title":"Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)","author":"S. Ryoo","year":"2008","unstructured":"Ryoo, S., Rodrigues, C.I., Baghsorkhi, S.S., Stone, S.S., Kirk, D.B., Hwu, W.M.: Optimization principles and application performance evaluation of a multithreaded GPU using CUDA. In: Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), pp. 73\u201382 (2008). doi: 10.1145\/1345206.1345220"},{"key":"179_CR39","first-page":"195","volume-title":"Proceedings of the International Symposium on Code Generation and Optimization (CGO)","author":"S. Ryoo","year":"2008","unstructured":"Ryoo, S., Rodrigues, C.I., Stone, S.S., Baghsorkhi, S.S., Ueng, S.Z., Stratton, J.A., Hwu, W.M.W.: Program optimization space pruning for a multithreaded GPU. In: Proceedings of the International Symposium on Code Generation and Optimization (CGO), pp. 195\u2013204 (2008). doi: 10.1145\/1356058.1356084"},{"issue":"1","key":"179_CR40","doi-asserted-by":"crossref","first-page":"474","DOI":"10.1186\/1471-2105-8-474","volume":"8","author":"M. Schatz","year":"2007","unstructured":"Schatz, M., Trapnell, C., Delcher, A., Varshney, A.: High-throughput sequence alignment using graphics processing units. BMC Bioinform. 8(1), 474 (2007). doi: 10.1186\/1471-2105-8-474","journal-title":"BMC Bioinform."},{"key":"179_CR41","unstructured":"TOP500: http:\/\/www.top500.org (2011)"},{"key":"179_CR42","doi-asserted-by":"crossref","first-page":"200","DOI":"10.1109\/CGO.2009.20","volume-title":"Proceedings of the International Symposium on Code Generation and Optimization (CGO)","author":"A. Udupa","year":"2009","unstructured":"Udupa, A., Govindarajan, R., Thazhuthaveetil, M.J.: Software pipelined execution of stream programs on GPUs. In: Proceedings of the International Symposium on Code Generation and Optimization (CGO), pp. 200\u2013209 (2009). doi: 10.1109\/CGO.2009.20"},{"issue":"9","key":"179_CR43","doi-asserted-by":"crossref","first-page":"1477","DOI":"10.1016\/j.cpc.2010.04.018","volume":"181","author":"M. Valiev","year":"2010","unstructured":"Valiev, M., Bylaska, E., Govind, N., Kowalski, K., Straatsma, T., Dam, H.V., Wang, D., Nieplocha, J., Apra, E., Windus, T., de Jong, W.: NWChem: A comprehensive and scalable open-source solution for large scale molecular simulations. Comput. Phys. Commun. 181(9), 1477\u20131489 (2010). doi: 10.1016\/j.cpc.2010.04.018 . URL http:\/\/www.sciencedirect.com\/science\/article\/pii\/S0010465510001438","journal-title":"Comput. Phys. Commun."},{"key":"179_CR44","unstructured":"Volkov, V., Demmel, J.: LU, QR and Cholesky Factorizations using Vector Capabilities of GPUs. Tech. Rep. UCB\/EECS-2008-49, EECS Department. University of California, Berkeley (2008). URL http:\/\/www.eecs.berkeley.edu\/Pubs\/TechRpts\/2008\/EECS-2008-49.html"},{"key":"179_CR45","first-page":"1","volume-title":"Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing","author":"V. Volkov","year":"2008","unstructured":"Volkov, V., Demmel, J.W.: Benchmarking GPUs to tune dense linear algebra. In: Proceedings of the ACM\/IEEE SC Conference on High Performance Networking and Computing, pp. 1\u201311 (2008)"}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-011-0179-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10586-011-0179-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-011-0179-2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,12,17]],"date-time":"2021-12-17T21:31:14Z","timestamp":1639776674000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10586-011-0179-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011,11,24]]},"references-count":45,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2013,3]]}},"alternative-id":["179"],"URL":"https:\/\/doi.org\/10.1007\/s10586-011-0179-2","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"value":"1386-7857","type":"print"},{"value":"1573-7543","type":"electronic"}],"subject":[],"published":{"date-parts":[[2011,11,24]]}}}