{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:39:33Z","timestamp":1766219973831,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754651","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"208-217","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AMPED: Accelerating MTTKRP for Billion-Scale Sparse Tensor Decomposition on Multiple GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5538-2988","authenticated-orcid":false,"given":"Sasindu","family":"Wijeratne","sequence":"first","affiliation":[{"name":"University of Southern California, Los Angeles, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8736-3012","authenticated-orcid":false,"given":"Rajgopal","family":"Kannan","sequence":"additional","affiliation":[{"name":"DEVCOM Army Research Office, Los Angeles, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1609-8589","authenticated-orcid":false,"given":"Viktor","family":"Prasanna","sequence":"additional","affiliation":[{"name":"University of Southern California, LOS ANGELES, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1017\/9781108855273"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626183.3659980"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00234"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00234"},{"key":"e_1_3_3_1_6_2","unstructured":"Andrzej Cichocki. 2014. Era of big data processing: A new approach via tensor networks and tensor decompositions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1403.2048 (2014)."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Andrzej Cichocki Danilo Mandic Lieven De\u00a0Lathauwer Guoxu Zhou Qibin Zhao Cesar Caiafa and Huy\u00a0Anh Phan. 2015. Tensor decompositions for signal processing applications: From two-way to multiway component analysis. IEEE signal processing magazine 32 2 (2015) 145\u2013163.","DOI":"10.1109\/MSP.2013.2297439"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2008.7476520"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"G\u00e9rard Favier and Andr\u00e9\u00a0LF de Almeida. 2014. Overview of constrained PARAFAC models. EURASIP Journal on Advances in Signal Processing 2014 1 (2014) 1\u201325.","DOI":"10.1186\/1687-6180-2014-142"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Sofia Fernandes Hadi Fanaee-T and Jo\u00e3o Gama. 2021. Tensor decomposition for analysing time-evolving social networks: An overview. Artificial Intelligence Review 54 4 (2021) 2891\u20132916.","DOI":"10.1007\/s10462-020-09916-4"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447818.3461703"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Pieter Hijma Stijn Heldens Alessio Sclocco Ben van Werkhoven and Henri\u00a0E. Bal. 2023. Optimization Techniques for GPU Programming. ACM Comput. Surv. 55 11 Article 239 (March 2023) 81\u00a0pages. 10.1145\/3570638","DOI":"10.1145\/3570638"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"David Hong Tamara\u00a0G Kolda and Jed\u00a0A Duersch. 2020. Generalized canonical polyadic tensor decomposition. SIAM Rev. 62 1 (2020) 133\u2013163.","DOI":"10.1137\/18M1203626"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","unstructured":"Yuwang Ji Qiang Wang Xuan Li and Jie Liu. 2019. A Survey on Tensor Techniques and Applications in Machine Learning. IEEE Access 7 (2019) 162950\u2013162990. 10.1109\/ACCESS.2019.2949814","DOI":"10.1109\/ACCESS.2019.2949814"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337889"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Tamara\u00a0G Kolda and Brett\u00a0W Bader. 2009. Tensor decompositions and applications. SIAM review 51 3 (2009) 455\u2013500.","DOI":"10.1137\/07070111X"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00097"},{"key":"e_1_3_3_1_18_2","unstructured":"Jan Laukemann Ahmed\u00a0E Helal S Anderson Fabio Checconi Yongseok Soh Jesmin\u00a0Jahan Tithi Teresa Ranadive Brian\u00a0J Gravelle Fabrizio Petrini and Jee Choi. 2024. Accelerating Sparse Tensor Decomposition Using Adaptive Linearized Representation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.06348 (2024)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2018.8573483"},{"key":"e_1_3_3_1_20_2","unstructured":"Jiajia Li Yuchen Ma and Richard Vuduc. 2018. ParTI!: A parallel tensor infrastructure for multicore CPUs and GPUs. A parallel tensor infrastructure for multicore CPUs and GPUs (2018)."},{"key":"e_1_3_3_1_21_2","unstructured":"Jiajia Li Bora U\u00e7ar \u00dcmit\u00a0V. \u00c7ataly\u00fcrek Jimeng Sun Kevin Barker and Richard Vuduc. 2019. Efficient and Effective Sparse Tensor Reordering. https:\/\/github.com\/hpcgarage\/ParTI"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER59578.2024.00036"},{"key":"e_1_3_3_1_23_2","unstructured":"Julian McAuley. 2021. Recommender Systems and Personalization Datasets. https:\/\/cseweb.ucsd.edu\/\u00a0jmcauley\/datasets.html#"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532363"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Andy Nguyen Ahmed\u00a0E Helal Fabio Checconi Jan Laukemann Jesmin\u00a0Jahan Tithi Yongseok Soh Teresa Ranadive Fabrizio Petrini and Jee\u00a0W Choi. 2022. Efficient out-of-memory sparse MTTKRP on massively parallel architectures. https:\/\/github.com\/jeewhanchoi\/blocked-linearized-coordinate","DOI":"10.1145\/3524059.3532363"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356216"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Israt Nisa Jiajia Li Aravind Sukumaran-Rajam Prasant\u00a0Singh Rawat Sriram Krishnamoorthy and Ponnuswamy Sadayappan. 2019. An Efficient Mixed-Mode Representation of Sparse Tensors. https:\/\/github.com\/isratnisa\/MM-CSF","DOI":"10.1145\/3295500.3356216"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00023"},{"key":"e_1_3_3_1_29_2","unstructured":"NVIDIA Corporation. 2024. GPUDirect. https:\/\/developer.nvidia.com\/gpudirect Accessed: 2025-04-30."},{"key":"e_1_3_3_1_30_2","unstructured":"NVIDIA Corporation. 2024. NVIDIA NVLink. https:\/\/www.nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/ Accessed: 2025-04-30."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","unstructured":"John\u00a0D. Owens Mike Houston David Luebke Simon Green John\u00a0E. Stone and James\u00a0C. Phillips. 2008. GPU Computing. Proc. IEEE 96 5 (2008) 879\u2013899. 10.1109\/JPROC.2008.917757","DOI":"10.1109\/JPROC.2008.917757"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3460231.3474267"},{"key":"e_1_3_3_1_33_2","volume-title":"FROSTT: The Formidable Repository of Open Sparse Tensors and Tools","author":"Smith Shaden","year":"2017","unstructured":"Shaden Smith, Jee\u00a0W. Choi, Jiajia Li, Richard Vuduc, Jongsoo Park, Xing Liu, and George Karypis. 2017. FROSTT: The Formidable Repository of Open Sparse Tensors and Tools. http:\/\/frostt.io\/"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00062"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC49654.2021.9622851"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/SBAC-PAD59825.2023.00012"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3649153.3649187"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3543622.3573179"},{"key":"e_1_3_3_1_39_2","unstructured":"Cyril Zeller. 2011. CUDA C\/C++ Basics. (2011)."}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754651","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:35:27Z","timestamp":1766219727000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754651"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":38,"alternative-id":["10.1145\/3754598.3754651","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754651","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}