{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:00:18Z","timestamp":1761897618855,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032024350","type":"print"},{"value":"9783032024367","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02436-7_9","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T06:50:08Z","timestamp":1761893408000},"page":"126-141","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Performance-Portable Tensor Transpositions in\u00a0MLIR"],"prefix":"10.1007","author":[{"given":"Mahesh","family":"Lakshminarasimhan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mahesh","family":"Ravishankar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mary","family":"Hall","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"P.","family":"Sadayappan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"9_CR1","unstructured":"Firefly - A new compiler and runtime for BEAM languages. https:\/\/github.com\/GetFirefly\/firefly"},{"key":"9_CR2","unstructured":"IREE Compiler. https:\/\/openxla.github.io\/iree\/. https:\/\/github.com\/openxla\/iree"},{"key":"9_CR3","unstructured":"Microsoft Accera Compiler. https:\/\/github.com\/microsoft\/Accera. https:\/\/microsoft.github.io\/Accera\/"},{"key":"9_CR4","unstructured":"PennyLane Catalyst Compiler for hybrid quantum-classical programs. https:\/\/github.com\/PennyLaneAI\/catalyst"},{"key":"9_CR5","unstructured":"PlaidML Tensor Compiler. https:\/\/github.com\/plaidml\/plaidml. https:\/\/plaidml.github.io\/plaidml\/"},{"key":"9_CR6","unstructured":"SHARK High Performance Machine Learning Distribution. https:\/\/github.com\/nod-ai\/SHARK"},{"key":"9_CR7","unstructured":"The Torch MLIR Project. https:\/\/github.com\/llvm\/torch-mlir"},{"key":"9_CR8","unstructured":"Abadi, M., et al.: TensorFlow: large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467 (2016)"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Ben-Nun, T., Ates, B., Calotoiu, A., Hoefler, T.: Bridging control-centric and data-centric optimization. In: Proceedings of the 21st ACM\/IEEE International Symposium on Code Generation and Optimization. CGO 2023 (2023)","DOI":"10.1145\/3579990.3580018"},{"key":"9_CR10","unstructured":"Ga\u00ebl Guennebaud and Beno\u00eet Jacob and others: Eigen library (2010). http:\/\/eigen.tuxfamily.org"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Georganas, E., et al.: Anatomy of high-performance deep learning convolutions on SIMD architectures. In: SC18: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 830\u2013841. IEEE (2018)","DOI":"10.1109\/SC.2018.00069"},{"key":"9_CR12","doi-asserted-by":"publisher","unstructured":"Georganas, E., et al.: Harnessing deep learning via a single building block. In: 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 222\u2013233 (2020). https:\/\/doi.org\/10.1109\/IPDPS47924.2020.00032","DOI":"10.1109\/IPDPS47924.2020.00032"},{"key":"9_CR13","unstructured":"Hammond, J.: Automatically tuned libraries for native-dimension tensor transpose and contraction algorithms (2009)"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Heinecke, A., Henry, G., Hutchinson, M., Pabst, H.: LIBXSMM: accelerating small matrix multiplications by runtime code generation. In: SC\u201916: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 981\u2013991. IEEE (2016)","DOI":"10.1109\/SC.2016.83"},{"key":"9_CR15","unstructured":"Hu, P., Lu, M., Wang, L., Jiang, G.: TPU-MLIR: A Compiler For TPU Using MLIR. arXiv preprint arXiv:2210.15016 (2022)"},{"key":"9_CR16","unstructured":"Hynninen, A.P., Lyakh, D.I.: cuTT: a high-performance tensor transpose library for CUDA compatible GPUs. arXiv preprint arXiv:1705.01598 (2017)"},{"key":"9_CR17","unstructured":"Jin, T., et al.: Compiling ONNX neural network models using MLIR. arXiv preprint arXiv:2008.08272 (2020)"},{"issue":"5","key":"9_CR18","doi-asserted-by":"publisher","first-page":"876","DOI":"10.1007\/s10766-015-0366-5","volume":"43","author":"JL Jodra","year":"2015","unstructured":"Jodra, J.L., Gurrutxaga, I., Muguerza, J.: Efficient 3D transpositions in graphics processing units. Int. J. Parallel Prog. 43(5), 876\u2013891 (2015)","journal-title":"Int. J. Parallel Prog."},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Lattner, C., et al.: MLIR: scaling compiler infrastructure for domain specific computation. In: 2021 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO), pp. 2\u201314. IEEE (2021)","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"9_CR20","unstructured":"Liu, Y., Wang, Y., Yu, R., Li, M., Sharma, V., Wang, Y.: Optimizing $$\\{$$CNN$$\\}$$ model inference on $$\\{$$CPUs$$\\}$$. In: 2019 USENIX Annual Technical Conference (USENIX ATC 2019), pp. 1025\u20131040 (2019)"},{"key":"9_CR21","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1016\/j.cpc.2014.12.013","volume":"189","author":"DI Lyakh","year":"2015","unstructured":"Lyakh, D.I.: An efficient tensor transpose algorithm for multicore CPU, Intel Xeon Phi, and NVidia Tesla GPU. Comput. Phys. Commun. 189, 84\u201391 (2015)","journal-title":"Comput. Phys. Commun."},{"key":"9_CR22","unstructured":"McCalpin, J.D., et\u00a0al.: Memory bandwidth and machine balance in current high performance computers. IEEE Comput. Soc. Tech. Committee Comput. Archit. (TCCA) Newslett. 2(19-25) (1995)"},{"key":"9_CR23","unstructured":"Mutlu, E., et al.: COMET: A Domain-Specific Compilation of High-Performance Computational Chemistry. arXiv preprint arXiv:2102.06827 (2021)"},{"key":"9_CR24","doi-asserted-by":"publisher","first-page":"947","DOI":"10.1016\/j.future.2017.08.007","volume":"92","author":"SJ Pennycook","year":"2019","unstructured":"Pennycook, S.J., Sewall, J.D., Lee, V.W.: Implications of a metric for performance portability. Futur. Gener. Comput. Syst. 92, 947\u2013958 (2019)","journal-title":"Futur. Gener. Comput. Syst."},{"issue":"3","key":"9_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3157733","volume":"44","author":"P Springer","year":"2018","unstructured":"Springer, P., Bientinesi, P.: Design of a high-performance GEMM-like tensor-tensor multiplication. ACM Trans. Math. Softw. (TOMS) 44(3), 1\u201329 (2018)","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"issue":"2","key":"9_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3104988","volume":"44","author":"P Springer","year":"2017","unstructured":"Springer, P., Hammond, J.R., Bientinesi, P.: TTC: a high-performance compiler for tensor transpositions. ACM Trans. Math. Softw. (TOMS) 44(2), 1\u201321 (2017)","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Springer, P., Su, T., Bientinesi, P.: HPTT: a high-performance tensor transposition C++ library. In: Proceedings of the 4th ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming, pp. 56\u201362 (2017)","DOI":"10.1145\/3091966.3091968"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Tillet, P., Kung, H.T., Cox, D.: Triton: an intermediate language and compiler for tiled neural network computations. In: Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages, pp. 10\u201319 (2019)","DOI":"10.1145\/3315508.3329973"},{"issue":"3","key":"9_CR29","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2764454","volume":"41","author":"FG Van Zee","year":"2015","unstructured":"Van Zee, F.G., Van De Geijn, R.A.: BLIS: a framework for rapidly instantiating BLAS functionality. ACM Trans. Math. Softw. (TOMS) 41(3), 1\u201333 (2015)","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Vedurada, J., et al.: TTLG - an efficient tensor transposition library for GPUs. In: 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 578\u2013588. IEEE (2018)","DOI":"10.1109\/IPDPS.2018.00067"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Wei, L., Mellor-Crummey, J.: Autotuning tensor transposition. In: 2014 IEEE International Parallel & Distributed Processing Symposium Workshops, pp. 342\u2013351. IEEE (2014)","DOI":"10.1109\/IPDPSW.2014.43"}],"container-title":["Lecture Notes in Computer Science","Languages and Compilers for Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02436-7_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T06:50:16Z","timestamp":1761893416000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02436-7_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9783032024350","9783032024367"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02436-7_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"LCPC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on Languages and Compilers for Parallel Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lexington, KY","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"36","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"lcpc2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.lcpcworkshop.org\/LCPC23\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}