{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:40:09Z","timestamp":1750297209379,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,19]],"date-time":"2025-02-19T00:00:00Z","timestamp":1739923200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100020950","name":"National Science and Technology Council","doi-asserted-by":"publisher","award":["1132221E007140"],"award-info":[{"award-number":["1132221E007140"]}],"id":[{"id":"10.13039\/501100020950","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,19]]},"DOI":"10.1145\/3712031.3712033","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T12:28:34Z","timestamp":1743078514000},"page":"90-98","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ITTPD: In-place Tensor Transposition with Permutation Decomposition on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-0620-6442","authenticated-orcid":false,"given":"Kai-Jung","family":"Cheng","sequence":"first","affiliation":[{"name":"National Tsing Hua University, Hsinchu, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3940-4478","authenticated-orcid":false,"given":"Che-Rung","family":"Lee","sequence":"additional","affiliation":[{"name":"National Tsing Hua University, Hsinchu, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,3,27]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"2022. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","unstructured":"A. Abdelfattah M. Baboulin V. Dobrev J. Dongarra C. Earl J. Falcou A. Haidar I. Karlin Tz. Kolev I. Masliah and S. Tomov. 2016. High-performance Tensor Contractions for GPUs. Procedia Computer Science 80 (2016) 108 \u2013 118. 10.1016\/j.procs.2016.05.302International Conference on Computational Science 2016 ICCS 2016 6-8 June 2016 San Diego California USA.","DOI":"10.1016\/j.procs.2016.05.302"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-21175-1_16"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","unstructured":"Brett\u00a0W. Bader and Tamara\u00a0G. Kolda. 2006. Algorithm 862: MATLAB tensor classes for fast algorithm prototyping. ACM Trans. Math. Softw. 32 4 (dec 2006) 635\u2013653. 10.1145\/1186785.1186794","DOI":"10.1145\/1186785.1186794"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","unstructured":"Vineet Bafna and Pavel\u00a0A. Pevzner. 1996. Genome Rearrangements and Sorting by Reversals. SIAM J. Comput. 25 2 (1996) 272\u2013289. 10.1137\/S0097539793250627 arXiv:10.1137\/S0097539793250627","DOI":"10.1137\/S0097539793250627"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","unstructured":"Vineet Bafna and Pavel\u00a0A. Pevzner. 1998. Sorting by Transpositions. SIAM Journal on Discrete Mathematics 11 2 (1998) 224\u2013240. 10.1137\/S089548019528280X arXiv:10.1137\/S089548019528280X","DOI":"10.1137\/S089548019528280X"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","first-page":"200","DOI":"10.1007\/3-540-45749-6_21","volume-title":"Algorithms \u2014 ESA 2002","author":"Berman Piotr","year":"2002","unstructured":"Piotr Berman, Sridhar Hannenhalli, and Marek Karpinski. 2002. 1.375-Approximation Algorithm for Sorting by Reversals. In Algorithms \u2014 ESA 2002, Rolf M\u00f6hring and Rajeev Raman (Eds.). Springer Berlin Heidelberg, Berlin, Heidelberg, 200\u2013210."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","unstructured":"Sangeeta Bhatia Pedro Feij\u00e3o and Andrew\u00a0R. Francis. 2018. Position and Content Paradigms in Genome Rearrangements: The Wild and Crazy World of Permutations in Genomics. Bulletin of Mathematical Biology 80 12 (01 Dec 2018) 3227\u20133246. 10.1007\/s11538-018-0514-3","DOI":"10.1007\/s11538-018-0514-3"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"L\u00a0Susan Blackford Antoine Petitet Roldan Pozo Karin Remington R\u00a0Clint Whaley James Demmel Jack Dongarra Iain Duff Sven Hammarling Greg Henry et\u00a0al. 2002. An updated set of basic linear algebra subprograms (BLAS). ACM Trans. Math. Software 28 2 (2002) 135\u2013151.","DOI":"10.1145\/567806.567807"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Laurent Bulteau Guillaume Fertin and Irena Rusu. 2012. Sorting by Transpositions Is Difficult. SIAM Journal on Discrete Mathematics 26 3 (2012) 1148\u20131180. 10.1137\/110851390 arXiv:10.1137\/110851390","DOI":"10.1137\/110851390"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Laurent Bulteau and Mathias Weller. 2019. Parameterized Algorithms in Bioinformatics: An Overview. Algorithms 12 12 (2019). 10.3390\/a12120256","DOI":"10.3390\/a12120256"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/267521.267531"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","unstructured":"Bryan Catanzaro Alexander Keller and Michael Garland. 2014. A Decomposition for In-Place Matrix Transposition. SIGPLAN Not. 49 8 (Feb. 2014) 193\u2013206. 10.1145\/2692916.2555253","DOI":"10.1145\/2692916.2555253"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.5555\/314613.314711"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","unstructured":"Lu\u00eds Felipe\u00a0I. Cunha Luis Antonio\u00a0B. Kowada Rodrigo de\u00a0A. Hausen and Celina\u00a0M.H. de Figueiredo. 2015. A Faster 1.375-Approximation Algorithm for Sorting by Transpositions*. Journal of Computational Biology 22 11 (2015) 1044\u20131056. 10.1089\/cmb.2014.0298 arXiv:10.1089\/cmb.2014.0298 PMID: 26383040.","DOI":"10.1089\/cmb.2014.0298"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","unstructured":"ULISSES DIAS and ZANONI DIAS. 2013. HEURISTICS FOR THE TRANSPOSITION DISTANCE PROBLEM. Journal of Bioinformatics and Computational Biology 11 05 (2013) 1350013. 10.1142\/S0219720013500133 arXiv:10.1142\/S0219720013500133 PMID: 24131057.","DOI":"10.1142\/S0219720013500133"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/1463768.1463781"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","unstructured":"Isaac Elias and Tzvika Hartman. 2006. A 1.375-Approximation Algorithm for Sorting by Transpositions. IEEE\/ACM Transactions on Computational Biology and Bioinformatics 3 4 (2006) 369\u2013379. 10.1109\/TCBB.2006.44","DOI":"10.1109\/TCBB.2006.44"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"Muhammad Elsayed Saleh El-shehaby and Mohamed Abougabal. 2015. NDPA: A generalized efficient parallel in-place N-Dimensional Permutation Algorithm. Alexandria Engineering Journal 32 (04 2015). 10.1016\/j.aej.2015.03.024","DOI":"10.1016\/j.aej.2015.03.024"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","unstructured":"Glen Evenbly. 2022. A Practical Guide to the Numerical Implementation of Tensor Networks I: Contractions Decompositions and Gauge Freedom. Frontiers in Applied Mathematics and Statistics 8 (2022). 10.3389\/fams.2022.806549","DOI":"10.3389\/fams.2022.806549"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","unstructured":"Fred Gustavson Lars Karlsson and Bo K\u00e5gstr\u00f6m. 2012. Parallel and Cache-Efficient In-Place Matrix Storage Format Conversion. 38 3 Article 17 (April 2012) 32\u00a0pages. 10.1145\/2168773.2168775","DOI":"10.1145\/2168773.2168775"},{"key":"e_1_3_3_1_23_2","unstructured":"Fred\u00a0Gehrung Gustavson and John\u00a0A Gunnels. 2014. Method and structure for cache aware transposition via rectangular subsections. (2 2014)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","unstructured":"Fred\u00a0G. Gustavson and David\u00a0W. Walker. 2019. Algorithms for in-place matrix transposition. Concurrency and Computation: Practice and Experience 31 13 (2019) e5071. 10.1002\/cpe.5071 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/cpe.5071e5071 cpe.5071.","DOI":"10.1002\/cpe.5071"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","unstructured":"J. G\u00f3mez-Luna I. Sung L. Chang J.\u00a0M. Gonz\u00e1lez-Linares N. Guil and W.\u00a0W. Hwu. 2016. In-Place Matrix Transposition on GPUs. IEEE Transactions on Parallel and Distributed Systems 27 3 (2016) 776\u2013788. 10.1109\/TPDS.2015.2412549","DOI":"10.1109\/TPDS.2015.2412549"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","unstructured":"So Hirata. 2003. Tensor Contraction Engine: Abstraction and Automated Parallel Implementation of Configuration-Interaction Coupled-Cluster and Many-Body Perturbation Theories. The Journal of Physical Chemistry A 107 (11 2003) 9887\u20139897. 10.1021\/jp034596z","DOI":"10.1021\/jp034596z"},{"key":"e_1_3_3_1_27_2","unstructured":"Antti-Pekka Hynninen and Dmitry\u00a0I. Lyakh. 2017. cuTT: A High-Performance Tensor Transpose Library for CUDA Compatible GPUs. arxiv:https:\/\/arXiv.org\/abs\/1705.01598\u00a0[cs.MS]"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","unstructured":"Jose\u00a0L. Jodra Ibai Gurrutxaga and Javier Muguerza. 2015. Efficient 3D Transpositions in Graphics Processing Units. Int. J. Parallel Program. 43 5 (Oct. 2015) 876\u2013891. 10.1007\/s10766-015-0366-5","DOI":"10.1007\/s10766-015-0366-5"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","unstructured":"John Kececioglu and David Sankoff. 1995. Exact and Approximation Algorithms for Sorting by Reversals with Application to Genome Rearrangement. Algorithmica 13 (02 1995) 180\u2013210. 10.1007\/BF01188586","DOI":"10.1007\/BF01188586"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","unstructured":"Tamara\u00a0G. Kolda and Brett\u00a0W. Bader. 2009. Tensor Decompositions and Applications. SIAM Rev. 51 3 (2009) 455\u2013500. 10.1137\/07070111X arXiv:10.1137\/07070111X","DOI":"10.1137\/07070111X"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","unstructured":"Dmitry\u00a0I. Lyakh. 2015. An efficient tensor transpose algorithm for multicore CPU Intel Xeon Phi and NVidia Tesla GPU. Computer Physics Communications 189 (1 2015). 10.1016\/j.cpc.2014.12.013","DOI":"10.1016\/j.cpc.2014.12.013"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","unstructured":"Devin\u00a0A. Matthews. 2018. High-Performance Tensor Contraction without Transposition. SIAM Journal on Scientific Computing 40 1 (2018) C1\u2013C24. 10.1137\/16M108968X arXiv:10.1137\/16M108968X","DOI":"10.1137\/16M108968X"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","unstructured":"Devin\u00a0A. Matthews. 2018. High-Performance Tensor Contraction without Transposition. SIAM Journal on Scientific Computing 40 1 (2018) C1\u2013C24. 10.1137\/16M108968X","DOI":"10.1137\/16M108968X"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","unstructured":"Chetan Nayak Steven\u00a0H. Simon Ady Stern Michael Freedman and Sankar Das\u00a0Sarma. 2008. Non-Abelian anyons and topological quantum computation. Reviews of Modern Physics 80 3 (Sep 2008) 1083\u20131159. 10.1103\/revmodphys.80.1083","DOI":"10.1103\/revmodphys.80.1083"},{"key":"e_1_3_3_1_35_2","unstructured":"Alexander Novikov Dmitry Podoprikhin Anton Osokin and Dmitry Vetrov. 2015. Tensorizing Neural Networks. arxiv:https:\/\/arXiv.org\/abs\/1509.06569\u00a0[cs.LG]"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","unstructured":"Andre\u00a0Rodrigues Oliveira Klairton\u00a0Lima Brito Alexsandro\u00a0Oliveira Alexandrino Gabriel Siqueira Ulisses Dias and Zanoni Dias. 2024. Rearrangement Distance Problems: An updated survey. ACM Comput. Surv. 56 8 Article 206 (apr 2024) 27\u00a0pages. 10.1145\/3653295","DOI":"10.1145\/3653295"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","unstructured":"Yang Shi U.\u00a0N. Niranjan Animashree Anandkumar and Cris Cecka. 2016. Tensor Contractions with Extended BLAS Kernels on CPU and GPU. 2016 IEEE 23rd International Conference on High Performance Computing (HiPC) (Dec 2016). 10.1109\/hipc.2016.031","DOI":"10.1109\/hipc.2016.031"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","unstructured":"Luiz Augusto\u00a0G. Silva Luis Antonio\u00a0B. Kowada Nora\u00ed\u00a0Romeu Rocco and Maria Em\u00edlia M.\u00a0T. Walter. 2022. A new 1.375-approximation algorithm for sorting by transpositions. Algorithms for Molecular Biology 17 1 (15 Jan 2022) 1. 10.1186\/s13015-022-00205-z","DOI":"10.1186\/s13015-022-00205-z"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","unstructured":"Edgar Solomonik Devin Matthews Jeff\u00a0R. Hammond John\u00a0F. Stanton and James Demmel. 2014. A massively parallel tensor contraction framework for coupled-cluster computations. J. Parallel and Distrib. Comput. 74 12 (2014) 3176 \u2013 3190. 10.1016\/j.jpdc.2014.06.002Domain-Specific Languages and High-Level Frameworks for High-Performance Computing.","DOI":"10.1016\/j.jpdc.2014.06.002"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","unstructured":"Paul Springer and Paolo Bientinesi. 2018. Design of a High-Performance GEMM-like Tensor\u2013Tensor Multiplication. ACM Trans. Math. Softw. 44 3 Article 28 (Jan. 2018) 29\u00a0pages. 10.1145\/3157733","DOI":"10.1145\/3157733"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","unstructured":"Paul Springer Aravind Sankaran and Paolo Bientinesi. 2016. TTC: a tensor transposition compiler for multiple architectures. Proceedings of the 3rd ACM SIGPLAN International Workshop on Libraries Languages and Compilers for Array Programming - ARRAY 2016 (2016). 10.1145\/2935323.2935328","DOI":"10.1145\/2935323.2935328"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","unstructured":"Paul Springer Tong Su and Paolo Bientinesi. 2017. HPTT: A High-Performance Tensor Transposition C++ Library(ARRAY 2017). Association for Computing Machinery New York NY USA 56\u201362. 10.1145\/3091966.3091968","DOI":"10.1145\/3091966.3091968"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/2555243.2555266"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","unstructured":"A.A. Tretyakov and E.E. Tyrtyshnikov. 2009. Optimal in-place transposition of rectangular matrices. Journal of Complexity 25 4 (2009) 377 \u2013 384. 10.1016\/j.jco.2009.02.008","DOI":"10.1016\/j.jco.2009.02.008"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","first-page":"447","DOI":"10.1007\/3-540-47969-4_30","volume-title":"Computer Vision \u2014 ECCV 2002","author":"Vasilescu M.\u00a0Alex\u00a0O.","year":"2002","unstructured":"M.\u00a0Alex\u00a0O. Vasilescu and Demetri Terzopoulos. 2002. Multilinear Analysis of Image Ensembles: TensorFaces. In Computer Vision \u2014 ECCV 2002, Anders Heyden, Gunnar Sparr, Mads Nielsen, and Peter Johansen (Eds.). Springer Berlin Heidelberg, Berlin, Heidelberg, 447\u2013460."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00067"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","unstructured":"Hans-Joachim Werner Peter\u00a0J. Knowles Gerald Knizia Frederick\u00a0R. Manby and Martin Sch\u00fctz. 2012. Molpro: a general-purpose quantum chemistry program package. WIREs Computational Molecular Science 2 2 (2012) 242\u2013253. 10.1002\/wcms.82 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/wcms.82","DOI":"10.1002\/wcms.82"},{"key":"e_1_3_3_1_48_2","unstructured":"Chun-Yu Wu Chih-Chieh Tu Kai-Jung Cheng and Che-Rung Lee. 2024. EITHOT: Efficient In-place Transposition of High Order Tensors on GPUs. ACM Transaction on Parallel Algorithm accepted (2024)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-10837-3_10"}],"event":{"name":"HPCASIA '25: Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region","acronym":"HPCASIA '25","location":"Hsinchu Taiwan"},"container-title":["Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712031.3712033","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712031.3712033","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:10Z","timestamp":1750295890000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712031.3712033"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,19]]},"references-count":48,"alternative-id":["10.1145\/3712031.3712033","10.1145\/3712031"],"URL":"https:\/\/doi.org\/10.1145\/3712031.3712033","relation":{},"subject":[],"published":{"date-parts":[[2025,2,19]]},"assertion":[{"value":"2025-03-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}