{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T07:08:56Z","timestamp":1769843336971,"version":"3.49.0"},"publisher-location":"Berlin, Heidelberg","reference-count":41,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642198601","type":"print"},{"value":"9783642198618","type":"electronic"}],"license":[{"start":{"date-parts":[[2011,1,1]],"date-time":"2011-01-01T00:00:00Z","timestamp":1293840000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2011]]},"DOI":"10.1007\/978-3-642-19861-8_13","type":"book-chapter","created":{"date-parts":[[2011,3,14]],"date-time":"2011-03-14T17:39:26Z","timestamp":1300124366000},"page":"225-245","source":"Crossref","is-referenced-by-count":60,"title":["Data Layout Transformation for Stencil Computations on Short-Vector SIMD Architectures"],"prefix":"10.1007","author":[{"given":"Tom","family":"Henretty","sequence":"first","affiliation":[]},{"given":"Kevin","family":"Stock","sequence":"additional","affiliation":[]},{"given":"Louis-No\u00ebl","family":"Pouchet","sequence":"additional","affiliation":[]},{"given":"Franz","family":"Franchetti","sequence":"additional","affiliation":[]},{"given":"J.","family":"Ramanujam","sequence":"additional","affiliation":[]},{"given":"P.","family":"Sadayappan","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"13_CR1","doi-asserted-by":"crossref","unstructured":"Allen, R., Kennedy, K.: Automatic translation of fortran programs to vector form. ACM TOPLAS 9(4) (1987)","DOI":"10.1145\/29873.29875"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Amarasinghe, S., Lam, M.: Communication optimization and code generation for distributed memory machines. In: PLDI (1993)","DOI":"10.1145\/155090.155102"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, J., Amarasinghe, S., Lam, M.: Data and computation transformations for multiprocessors. In: PPoPP (1995)","DOI":"10.1145\/209936.209954"},{"key":"13_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"772","DOI":"10.1007\/978-3-642-03869-3_72","volume-title":"Euro-Par 2009 Parallel Processing","author":"W. Augustin","year":"2009","unstructured":"Augustin, W., Heuveline, V., Weiss, J.-P.: Optimized stencil computation using in-place calculation on modern multicore systems. In: Sips, H., Epema, D., Lin, H.-X. (eds.) Euro-Par 2009. LNCS, vol.\u00a05704, pp. 772\u2013784. Springer, Heidelberg (2009)"},{"key":"13_CR5","doi-asserted-by":"crossref","unstructured":"Chatterjee, S., Gilbert, J., Schreiber, R., Teng, S.: Automatic array alignment in data-parallel programs. In: POPL (1993)","DOI":"10.1145\/158511.158517"},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Datta, K., Kamil, S., Williams, S., Oliker, L., Shalf, J., Yelick, K.: Optimization and performance modeling of stencil computations on modern microprocessors. SIAM Review 51(1) (2009)","DOI":"10.1137\/070693199"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Datta, K., Murphy, M., Volkov, V., Williams, S., Carter, J., Oliker, L., Patterson, D., Shalf, J., Yelick, K.: Stencil computation optimization and auto-tuning on state-of-the-art multicore architectures. In: SC 2008, pp. 1\u201312 (2008)","DOI":"10.1109\/SC.2008.5222004"},{"key":"13_CR8","unstructured":"Datta, K., Williams, S., Volkov, V., Carter, J., Oliker, L., Shalf, J., Yelick, K.: Auto-tuning the 27-point stencil for multicore. In: iWAPT 2009 (2009)"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"de la Cruz, R., Araya-Polo, M., Cela, J.M.: Introducing the semi-stencil algorithm. In: PPAM (1) (2009)","DOI":"10.1007\/978-3-642-14390-8_52"},{"key":"13_CR10","unstructured":"Dursun, H., Nomura, K., Wang, W., Kunaseth, M., Peng, L., Seymour, R., Kalia, R., Nakano, A., Vashishta, P.: In-core optimization of high-order stencil computations. In: PDPTA (2009)"},{"key":"13_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"642","DOI":"10.1007\/978-3-642-03869-3_61","volume-title":"Euro-Par 2009 Parallel Processing","author":"H. Dursun","year":"2009","unstructured":"Dursun, H., Nomura, K.-i., Peng, L., Seymour, R., Wang, W., Kalia, R.K., Nakano, A., Vashishta, P.: A multilevel parallelization framework for high-order stencil computations. In: Sips, H., Epema, D., Lin, H.-X. (eds.) Euro-Par 2009. LNCS, vol.\u00a05704, pp. 642\u2013653. Springer, Heidelberg (2009)"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Eichenberger, A., Wu, P., O\u2019Brien, K.: Vectorization for simd architectures with alignment constraints. In: PLDI (2004)","DOI":"10.1145\/996841.996853"},{"key":"13_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-540-71229-9_1","volume-title":"Compiler Construction","author":"L. Fireman","year":"2007","unstructured":"Fireman, L., Petrank, E., Zaks, A.: New algorithms for SIMD alignment. In: Adsul, B., Vetta, A. (eds.) CC 2007. LNCS, vol.\u00a04420, pp. 1\u201315. Springer, Heidelberg (2007)"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Hohenauer, M., Engel, F., Leupers, R., Ascheid, G., Meyr, H.: A simd optimization framework for retargetable compilers. ACM TACO 6(1) (2009)","DOI":"10.1145\/1509864.1509866"},{"key":"13_CR15","doi-asserted-by":"crossref","unstructured":"Jang, B., Mistry, P., Schaa, D., Dominguez, R., Kaeli, D.R.: Data transformations enabling loop vectorization on multithreaded data parallel architectures. In: PPOPP (2010)","DOI":"10.1145\/1693453.1693510"},{"key":"13_CR16","doi-asserted-by":"crossref","unstructured":"Kamil, S., Datta, K., Williams, S., Oliker, L., Shalf, J., Yelick, K.: Implicit and explicit optimizations for stencil computations. In: MSPC 2006 (2006)","DOI":"10.1145\/1178597.1178605"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Kamil, S., Husbands, P., Oliker, L., Shalf, J., Yelick, K.: Impact of modern memory subsystems on cache optimizations for stencil computations. In: MSP 2005 (2005)","DOI":"10.1145\/1111583.1111589"},{"key":"13_CR18","doi-asserted-by":"crossref","unstructured":"Kandemir, M., Choudhary, A., Shenoy, N., Banerjee, P., Ramanujam, J.: A linear algebra framework for automatic determination of optimal data layouts. IEEE TPDS 10(2) (1999)","DOI":"10.1109\/71.752779"},{"key":"13_CR19","volume-title":"Optimizing compilers for modern architectures: A dependence-based approach","author":"K. Kennedy","year":"2002","unstructured":"Kennedy, K., Allen, J.: Optimizing compilers for modern architectures: A dependence-based approach. Morgan Kaufmann, San Francisco (2002)"},{"key":"13_CR20","doi-asserted-by":"crossref","unstructured":"Kennedy, K., Kremer, U.: Automatic data layout for distributed-memory machines. ACM TOPLAS 20(4) (1998)","DOI":"10.1145\/291891.291901"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Krishnamoorthy, S., Baskaran, M., Bondhugula, U., Ramanujam, J., Rountev, A., Sadayappan, P.: Effective automatic parallelization of stencil computations. In: PLDI (2007)","DOI":"10.1145\/1250734.1250761"},{"key":"13_CR22","doi-asserted-by":"crossref","unstructured":"Larsen, S., Amarasinghe, S.P.: Exploiting superword level parallelism with multimedia instruction sets. In: PLDI (2000)","DOI":"10.1145\/349299.349320"},{"key":"13_CR23","unstructured":"Larsen, S., Witchel, E., Amarasinghe, S.P.: Increasing and detecting memory address congruence. In: IEEE PACT (2002)"},{"key":"13_CR24","doi-asserted-by":"crossref","unstructured":"Li, Z., Song, Y.: Automatic tiling of iterative stencil loops. ACM TOPLAS 26(6) (2004)","DOI":"10.1145\/1034774.1034777"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Meng, J., Skadron, K.: Performance modeling and automatic ghost zone optimization for iterative stencil loops on gpus. In: ICS (2009)","DOI":"10.1145\/1542275.1542313"},{"key":"13_CR26","doi-asserted-by":"crossref","unstructured":"Micikevicius, P.: 3d finite difference computation on gpus using cuda. In: GPGPU-2 (2009)","DOI":"10.1145\/1513895.1513905"},{"key":"13_CR27","unstructured":"Nuzman, D., Henderson, R.: Multi-platform auto-vectorization. In: CGO (2006)"},{"key":"13_CR28","doi-asserted-by":"crossref","unstructured":"Nuzman, D., Rosen, I., Zaks, A.: Auto-vectorization of interleaved data for simd. In: PLDI (2006)","DOI":"10.1145\/1133981.1133997"},{"key":"13_CR29","doi-asserted-by":"crossref","unstructured":"Nuzman, D., Zaks, A.: Outer-loop vectorization: revisited for short simd architectures. In: PACT (2008)","DOI":"10.1145\/1454115.1454119"},{"key":"13_CR30","unstructured":"O\u2019Boyle, M., Knijnenburg, P.: Nonsingular data transformations: Definition, validity, and applications. IJPP\u00a027(3) (1999)"},{"key":"13_CR31","doi-asserted-by":"crossref","unstructured":"Orozco, D., Gao, G.R.: Mapping the FDTD Application to Many-Core Chip Architectures. In: ICPP (2009)","DOI":"10.1109\/ICPP.2009.44"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"Rivera, G., Tseng, C.-W.: Data transformations for eliminating conflict misses. In: PLDI (1998)","DOI":"10.1145\/277650.277661"},{"key":"13_CR33","doi-asserted-by":"crossref","unstructured":"Shafiq, M., Pericas, M., de la Cruz, R., Araya-Polo, M., Navarro, N., Ayguade, E.: Exploiting memory customization in fpga for 3d stencil computations. In: FPT (2009)","DOI":"10.1109\/FPT.2009.5377644"},{"key":"13_CR34","doi-asserted-by":"crossref","unstructured":"Solar-Lezama, A., Arnold, G., Tancau, L., Bodik, R., Saraswat, V., Seshia, S.: Sketching stencils. In: PLDI (2007)","DOI":"10.1145\/1250734.1250754"},{"key":"13_CR35","unstructured":"Treibig, J., Wellein, G., Hager, G.: Efficient multicore-aware parallelization strategies for iterative stencil computations. CoRR, abs\/1004.1741 (2010)"},{"key":"13_CR36","doi-asserted-by":"crossref","unstructured":"Venkatasubramanian, S., Vuduc, R.: Tuned and wildly asynchronous stencil kernels for hybrid cpu\/gpu systems. In: ICS (2009)","DOI":"10.1145\/1542275.1542312"},{"key":"13_CR37","doi-asserted-by":"crossref","unstructured":"Wellein, G., Hager, G., Zeiser, T., Wittmann, M., Fehske, H.: Efficient temporal blocking for stencil computations by multicore-aware wavefront parallelization. In: COMPSAC (2009)","DOI":"10.1109\/COMPSAC.2009.82"},{"key":"13_CR38","doi-asserted-by":"crossref","unstructured":"Wittmann, M., Hager, G., Treibig, J., Wellein, G.: Leveraging shared caches for parallel temporal blocking of stencil codes on multicore processors and clusters. CoRR, abs\/1006.3148 (2010)","DOI":"10.1142\/S0129626410000296"},{"key":"13_CR39","volume-title":"High Performance Compilers For Parallel Computing","author":"M.J. Wolfe","year":"1996","unstructured":"Wolfe, M.J.: High Performance Compilers For Parallel Computing. Addison-Wesley, Reading (1996)"},{"key":"13_CR40","unstructured":"Wonnacott, D.: Achieving scalable locality with time skewing. IJPP\u00a030(3) (2002)"},{"key":"13_CR41","unstructured":"Wu, P., Eichenberger, A.E., Wang, A.: Efficient SIMD Code Generation for Runtime Alignment and Length Conversion. In: CGO (2005)"}],"container-title":["Lecture Notes in Computer Science","Compiler Construction"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-19861-8_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,21]],"date-time":"2019-05-21T11:39:00Z","timestamp":1558438740000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-19861-8_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011]]},"ISBN":["9783642198601","9783642198618"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-19861-8_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2011]]}}}