{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T17:14:55Z","timestamp":1764350095927},"publisher-location":"Berlin, Heidelberg","reference-count":28,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642286513"},{"type":"electronic","value":"9783642286520"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-28652-0_2","type":"book-chapter","created":{"date-parts":[[2012,3,22]],"date-time":"2012-03-22T21:02:02Z","timestamp":1332450122000},"page":"21-40","source":"Crossref","is-referenced-by-count":24,"title":["Automatic Restructuring of GPU Kernels for Exploiting Inter-thread Data Locality"],"prefix":"10.1007","author":[{"given":"Swapneela","family":"Unkule","sequence":"first","affiliation":[]},{"given":"Christopher","family":"Shaltz","sequence":"additional","affiliation":[]},{"given":"Apan","family":"Qasem","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"2_CR1","unstructured":"CUDA PTX ISA, http:\/\/www.nvidia.com\/content\/CUDAptxisa1.4.pdf"},{"key":"2_CR2","unstructured":"GPU Computing SDK, http:\/\/developer.nvidia.com"},{"key":"2_CR3","unstructured":"Kernel for min-max and reduction, http:\/\/supercomputingblog.com\/cuda\/cuda-tutorial-3-thread-communication\/"},{"key":"2_CR4","unstructured":"Top 500 Supercomputer Sites, http:\/\/www.top500.org"},{"key":"2_CR5","unstructured":"CUDA Programming Guide, Version 3.0. NVIDIA (2010)"},{"issue":"6","key":"2_CR6","doi-asserted-by":"crossref","first-page":"685","DOI":"10.1002\/cpe.1553","volume":"22","author":"L. Adhianto","year":"2010","unstructured":"Adhianto, L., Banerjee, S., Fagan, M., Krentel, M., Marin, G., Mellor-Crummey, J., Tallent, N.R.: Hpctoolkit: tools for performance analysis of optimized parallel programs. Concurrency and Computation: Practice and Experience\u00a022(6), 685\u2013701 (2010)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"2_CR7","unstructured":"Allen, R., Kennedy, K.: Optimizing Compilers for Modern Architectures. Morgan Kaufmann (2002)"},{"key":"2_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1007\/978-3-642-11970-5_14","volume-title":"Compiler Construction","author":"M.M. Baskaran","year":"2010","unstructured":"Baskaran, M.M., Ramanujam, J., Sadayappan, P.: Automatic C-to-CUDA Code Generation for Affine Programs. In: Gupta, R. (ed.) CC 2010. LNCS, vol.\u00a06011, pp. 244\u2013263. Springer, Heidelberg (2010)"},{"key":"2_CR9","doi-asserted-by":"crossref","unstructured":"Briggs, P., Cooper, K.D.: Effective partial redundancy elimination. In: Proceedings of the ACM SIGPLAN 1994 Conference on Programming Language Design and Implementation, PLDI 1994 (1994)","DOI":"10.1145\/178243.178257"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Browne, S., Dongarra, J., Garner, N., London, K., Mucci, P.: A scalable cross-platform infrastructure for application performance tuning using hardware counters. In: ACM\/IEEE 2000 Conference, Supercomputing (November 2000)","DOI":"10.1109\/SC.2000.10029"},{"issue":"6","key":"2_CR11","doi-asserted-by":"publisher","first-page":"1768","DOI":"10.1145\/197320.197366","volume":"16","author":"S. Carr","year":"1994","unstructured":"Carr, S., Kennedy, K.: Improving the ratio of memory operations to floating-point operations in loops. ACM Transactions on Programming Languages and Systems\u00a016(6), 1768\u20131810 (1994)","journal-title":"ACM Transactions on Programming Languages and Systems"},{"key":"2_CR12","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1145\/1693453.1693471","volume-title":"PPoPP 2010: Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","author":"J.W. Choi","year":"2010","unstructured":"Choi, J.W., Singh, A., Vuduc, R.W.: Model-driven autotuning of sparse matrix-vector multiply on GPUs. In: PPoPP 2010: Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 115\u2013126. ACM, New York (2010)"},{"key":"2_CR13","unstructured":"Cytron, R., Ferrante, J.: What\u2019s in a name? -or- the value of renaming for parallelism detection and storage allocation. In: ICPP 1987, pp. 19\u201327 (1987)"},{"key":"2_CR14","first-page":"1","volume-title":"SC 2008: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing","author":"K. Datta","year":"2008","unstructured":"Datta, K., Murphy, M., Volkov, V., Williams, S., Carter, J., Oliker, L., Patterson, D., Shalf, J., Yelick, K.: Stencil computation optimization and auto-tuning on state-of-the-art multicore architectures. In: SC 2008: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing, pp. 1\u201312. IEEE Press, Piscataway (2008)"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Murthy, G., Ravishankar, M., Sadayappan, M.B., Optimal, P.: loop unrolling for gpgpu programs. In: IEEE International Symposium on Parallel Distributed Processing (2010)","DOI":"10.1109\/IPDPS.2010.5470423"},{"key":"2_CR16","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1145\/1188455.1188549","volume-title":"SC 2006: Proceedings of the 2006 ACM\/IEEE Conference on Supercomputing","author":"N.K. Govindaraju","year":"2006","unstructured":"Govindaraju, N.K., Larsen, S., Gray, J., Manocha, D.: A memory model for scientific algorithms on graphics processors. In: SC 2006: Proceedings of the 2006 ACM\/IEEE Conference on Supercomputing, p. 89. ACM, New York (2006)"},{"key":"2_CR17","first-page":"1","volume-title":"SC 2008: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing","author":"N.K. Govindaraju","year":"2008","unstructured":"Govindaraju, N.K., Lloyd, B., Dotsenko, Y., Smith, B., Manferdelli, J.: High performance discrete fourier transforms on graphics processors. In: SC 2008: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing, pp. 1\u201312. IEEE Press, Piscataway (2008)"},{"key":"2_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-642-19595-2_9","volume-title":"Languages and Compilers for Parallel Computing","author":"S. Grauer-Gray","year":"2011","unstructured":"Grauer-Gray, S., Cavazos, J.: Optimizing and Auto-tuning Belief Propagation on the GPU. In: Cooper, K., Mellor-Crummey, J., Sarkar, V. (eds.) LCPC 2010. LNCS, vol.\u00a06548, pp. 121\u2013135. Springer, Heidelberg (2011)"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Lee, S., Min, S.J., Eigenmann, R.: OpenMP to GPGPU: a compiler framework for automatic translation and optimization. In: Proceedings of the 14th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (2009)","DOI":"10.1145\/1504176.1504194"},{"key":"2_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1007\/978-3-642-19328-6_10","volume-title":"High Performance Computing for Computational Science \u2013 VECPAR 2010","author":"R. Nath","year":"2011","unstructured":"Nath, R., Tomov, S., Dongarra, J.: Accelerating GPU Kernels for Dense Linear Algebra. In: Palma, J.M.L.M., Dayd\u00e9, M., Marques, O., Lopes, J.C. (eds.) VECPAR 2010. LNCS, vol.\u00a06449, pp. 83\u201392. Springer, Heidelberg (2011)"},{"key":"2_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1654059.1654090","volume-title":"SC 2009: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis","author":"A. Nukada","year":"2009","unstructured":"Nukada, A., Matsuoka, S.: Auto-tuning 3-d FFT library for CUDA GPUs. In: SC 2009: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis, pp. 1\u201310. ACM, New York (2009)"},{"key":"2_CR22","doi-asserted-by":"crossref","unstructured":"Rahimian, A., Lashuk, I., Veerapaneni, S., Chandramowlishwaran, A., Malhotra, D., Moon, L., Sampath, R., Shringarpure, A., Vetter, J., Vuduc, R., Zorin, D., Biros, G.: Petascale direct numerical simulation of blood flow on 200k cores and heterogeneous architectures. In: Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (2010)","DOI":"10.1109\/SC.2010.42"},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Ryoo, S., Rodrigues, C.I., Baghsorkhi, S.S., Stone, S.S., Kirk, D.B., Hwu, W.M.W.: Optimization principles and application performance evaluation of a multithreaded GPU using CUDA. In: Proceedings of the 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (2008)","DOI":"10.1145\/1345206.1345220"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Volkov, V., Demmel, J.W.: Benchmarking GPUs to tune dense linear algebra. In: SC 2008: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing (2008)","DOI":"10.1109\/SC.2008.5214359"},{"issue":"3","key":"2_CR25","doi-asserted-by":"publisher","first-page":"178","DOI":"10.1016\/j.parco.2008.12.006","volume":"35","author":"S. Williams","year":"2009","unstructured":"Williams, S., Oliker, L., Vuduc, R., Shalf, J., Yelick, K., Demmel, J.: Optimization of sparse matrix-vector multiplication on emerging multicore platforms. Parallel Comput.\u00a035(3), 178\u2013194 (2009)","journal-title":"Parallel Comput."},{"key":"2_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"343","DOI":"10.1007\/978-3-540-89740-8_24","volume-title":"Languages and Compilers for Parallel Computing","author":"Q. Yi","year":"2008","unstructured":"Yi, Q., Qasem, A.: Exploring the Optimization Space of Dense Linear Algebra Kernels. In: Amaral, J.N. (ed.) LCPC 2008. LNCS, vol.\u00a05335, pp. 343\u2013355. Springer, Heidelberg (2008)"},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Yixun, L., Zhang, E.Z., Shen, X.: A cross-input adaptive framework for GPU program optimizations. In: Proceedings of the 2009 IEEE International Symposium on Parallel & Distributed Processing (2009)","DOI":"10.1109\/IPDPS.2009.5160988"},{"key":"2_CR28","doi-asserted-by":"crossref","unstructured":"Zhuo, Y., Wu, X.L., Haldar, J.P., Hwu, W.M., Liang, Z.P., Sutton, B.P.: Accelerating iterative field-compensated mr image reconstruction on GPUs. In: Proceedings of the 2010 IEEE International Conference on Biomedical Imaging: From Nano to Macro, ISBI 2010 (2010)","DOI":"10.1109\/ISBI.2010.5490112"}],"container-title":["Lecture Notes in Computer Science","Compiler Construction"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-28652-0_2.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,21]],"date-time":"2024-04-21T21:24:50Z","timestamp":1713734690000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-28652-0_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642286513","9783642286520"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-28652-0_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2012]]}}}