{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:44:28Z","timestamp":1766220268512,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","funder":[{"name":"NSF","award":["2047821"],"award-info":[{"award-number":["2047821"]}]},{"name":"DOE","award":["DE-SC0021285"],"award-info":[{"award-number":["DE-SC0021285"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754664","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"228-237","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Scaling Distributed Graph Processing to Hundreds of GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4709-4724","authenticated-orcid":false,"given":"George","family":"Slota","sequence":"first","affiliation":[{"name":"RPI, Troy, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6656-6237","authenticated-orcid":false,"given":"Michael","family":"Mandulak","sequence":"additional","affiliation":[{"name":"RPI, Troy, NY, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"1616","volume-title":"2024 SC24: International Conference for High Performance Computing, Networking, Storage and Analysis SC","author":"Arai Junya","year":"2024","unstructured":"Junya Arai, Masahiro Nakao, Yuto Inoue, Kanto Teranishi, Koji Ueno, Keiichiro Yamamura, Mitsuhisa Sato, and Katsuki Fujisawa. 2024. Doubling Graph Traversal Efficiency to 198 TeraTEPS on the Supercomputer Fugaku. In 2024 SC24: International Conference for High Performance Computing, Networking, Storage and Analysis SC. IEEE Computer Society, 1616\u20131629."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.50"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3078597.3078616"},{"key":"e_1_3_3_1_5_2","first-page":"1","volume-title":"2022 IEEE High Performance Extreme Computing Conference (HPEC)","author":"Bogle Ian","year":"2022","unstructured":"Ian Bogle and George\u00a0M Slota. 2022. Achieving Speedups for Distributed Graph Biconnectivity. In 2022 IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 1\u20137."},{"key":"e_1_3_3_1_6_2","first-page":"1139","volume-title":"IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)","author":"Bogle Ian","year":"2022","unstructured":"Ian Bogle and George\u00a0M Slota. 2022. Distributed algorithms for the graph biconnectivity and least common ancestor problems. In IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE, 1139\u20131142."},{"key":"e_1_3_3_1_7_2","series-title":"(SC \u201913)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Boman Erik\u00a0G.","year":"2013","unstructured":"Erik\u00a0G. Boman, Karen\u00a0D. Devine, and Sivasankaran Rajamanickam. 2013. Scalable matrix computations on large scale-free graphs using 2D graph partitioning. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Denver, Colorado) (SC \u201913). Association for Computing Machinery, New York, NY, USA, Article 50, 12\u00a0pages."},{"key":"e_1_3_3_1_8_2","first-page":"118","volume-title":"IPDPS","author":"\u00c7ataly\u00fcrek \u00dcmit\u00a0V","year":"2001","unstructured":"\u00dcmit\u00a0V \u00c7ataly\u00fcrek and Cevdet Aykanat. 2001. A Fine-Grain Hypergraph Model for 2D Decomposition of Sparse Matrices.. In IPDPS, Vol.\u00a01. Citeseer, 118."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Avery Ching Sergey Edunov Maja Kabiljo Dionysios Logothetis and Sambavi Muthukrishnan. 2015. One trillion edges: graph processing at Facebook-scale. Proc. VLDB Endow. 8 12 (Aug. 2015) 1804\u20131815.","DOI":"10.14778\/2824032.2824077"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00010"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638498"},{"key":"e_1_3_3_1_12_2","first-page":"17","volume-title":"10th USENIX symposium on operating systems design and implementation (OSDI 12)","author":"Gonzalez Joseph\u00a0E","year":"2012","unstructured":"Joseph\u00a0E Gonzalez, Yucheng Low, Haijie Gu, Danny Bickson, and Carlos Guestrin. 2012. PowerGraph: Distributed Graph-Parallel computation on natural graphs. In 10th USENIX symposium on operating systems design and implementation (OSDI 12). 17\u201330."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Bruce Hendrickson Robert Leland and Steve Plimpton. 1995. An efficient parallel algorithm for matrix-vector multiplication. International Journal of High Speed Computing 7 01 (1995) 73\u201388.","DOI":"10.1142\/S0129053395000051"},{"key":"e_1_3_3_1_14_2","volume-title":"International Parallel and Distributed Processing Symposium (IPDPS)","author":"Jatala Vishwesh","year":"2020","unstructured":"Vishwesh Jatala, Roshan Dathathri, Gurbinder Gill, Loc Hoang, V.\u00a0Krishna Nandivada, and Keshav Pingali. 2020. A Study of Graph Analytics for Massive Datasets on Distributed GPUs. In International Parallel and Distributed Processing Symposium (IPDPS)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/IPDPS47924.2020.00019","volume-title":"2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"Jatala Vishwesh","year":"2020","unstructured":"Vishwesh Jatala, Roshan Dathathri, Gurbinder Gill, Loc Hoang, V.\u00a0Krishna Nandivada, and Keshav Pingali. 2020. A Study of Graph Analytics for Massive Datasets on Distributed Multi-GPUs. In 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). 84\u201394."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Zhihao Jia Yongkee Kwon Galen Shipman Pat McCormick Mattan Erez and Alex Aiken. 2017. A distributed multi-gpu system for fast graph processing. Proceedings of the VLDB Endowment 11 3 (2017) 297\u2013310.","DOI":"10.14778\/3157794.3157799"},{"key":"e_1_3_3_1_17_2","first-page":"1","volume-title":"2022 IEEE High Performance Extreme Computing Conference (HPEC)","author":"Kang Seunghwa","year":"2022","unstructured":"Seunghwa Kang, Joseph Nke, and Brad Rees. 2022. Analyzing Multi-trillion Edge Graphs on Large GPU Clusters: A Case Study with PageRank. In 2022 IEEE High Performance Extreme Computing Conference (HPEC). 1\u20137."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1109\/BigData59044.2023.10386309","volume-title":"2023 IEEE International Conference on Big Data (BigData)","author":"Koohi\u00a0Esfahani Mohsen","year":"2023","unstructured":"Mohsen Koohi\u00a0Esfahani, Paolo Boldi, Hans Vandierendonck, Peter Kilpatrick, and Sebastiano Vigna. 2023. On Overcoming HPC Challenges of Trillion-Scale Real-World Graph Datasets. In 2023 IEEE International Conference on Big Data (BigData). 215\u2013220."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00059"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","first-page":"135","DOI":"10.1145\/1807167.1807184","volume-title":"Proceedings of the 2010 ACM SIGMOD International Conference on Management of data","author":"Malewicz Grzegorz","year":"2010","unstructured":"Grzegorz Malewicz, Matthew\u00a0H Austern, Aart\u00a0JC Bik, James\u00a0C Dehnert, Ilan Horn, Naty Leiser, and Grzegorz Czajkowski. 2010. Pregel: a system for large-scale graph processing. In Proceedings of the 2010 ACM SIGMOD International Conference on Management of data. 135\u2013146."},{"key":"e_1_3_3_1_21_2","volume-title":"15th Workshop on Hot Topics in Operating Systems (HotOS XV)","author":"McSherry Frank","year":"2015","unstructured":"Frank McSherry, Michael Isard, and Derek\u00a0G Murray. 2015. Scalability! but at what { COST} ?. In 15th Workshop on Hot Topics in Operating Systems (HotOS XV)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Duane Merrill Michael Garland and Andrew Grimshaw. 2012. Scalable GPU graph traversal. ACM Sigplan Notices 47 8 (2012) 117\u2013128.","DOI":"10.1145\/2370036.2145832"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-49116-3_24"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/2588555.2610518"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356206"},{"key":"e_1_3_3_1_26_2","first-page":"1","volume-title":"2024 IEEE High Performance Extreme Computing Conference (HPEC)","author":"Slota George\u00a0M","year":"2024","unstructured":"George\u00a0M Slota and Christopher Brissette. 2024. Constant-Memory Graph Coarsening. In 2024 IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 1\u20137."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","first-page":"17","DOI":"10.1109\/IPDPS.2015.54","volume-title":"2015 IEEE International Parallel and Distributed Processing Symposium","author":"Slota George\u00a0M","year":"2015","unstructured":"George\u00a0M Slota, Sivasankaran Rajamanickam, and Kamesh Madduri. 2015. High-performance graph analytics on manycore processors. In 2015 IEEE International Parallel and Distributed Processing Symposium. IEEE, 17\u201327."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.93"},{"key":"e_1_3_3_1_29_2","first-page":"90","volume-title":"European Conference on Parallel Processing","author":"Solomonik Edgar","year":"2011","unstructured":"Edgar Solomonik and James Demmel. 2011. Communication-optimal parallel 2.5 D matrix multiplication and LU factorization algorithms. In European Conference on Parallel Processing. Springer, 90\u2013109."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Christian\u00a0L Staudt Aleksejs Sazonovs and Henning Meyerhenke. 2016. NetworKit: A tool suite for large-scale complex network analysis. Network Science 4 4 (2016) 508\u2013530.","DOI":"10.1017\/nws.2016.20"},{"key":"e_1_3_3_1_31_2","series-title":"(ICPP \u201919)","volume-title":"Proceedings of the 48th International Conference on Parallel Processing","author":"Tom Ancy\u00a0Sarah","year":"2019","unstructured":"Ancy\u00a0Sarah Tom and George Karypis. 2019. A 2D Parallel Triangle Counting Algorithm for Distributed-Memory Architectures. In Proceedings of the 48th International Conference on Parallel Processing (Kyoto, Japan) (ICPP \u201919). Association for Computing Machinery, New York, NY, USA, Article 45, 10\u00a0pages."},{"key":"e_1_3_3_1_32_2","first-page":"1","volume-title":"Proceedings of the 21st ACM SIGPLAN symposium on principles and practice of parallel programming","author":"Wang Yangzihao","year":"2016","unstructured":"Yangzihao Wang, Andrew Davidson, Yuechao Pan, Yuduo Wu, Andy Riffel, and John\u00a0D Owens. 2016. Gunrock: A high-performance graph processing library on the GPU. In Proceedings of the 21st ACM SIGPLAN symposium on principles and practice of parallel programming. 1\u201312."},{"key":"e_1_3_3_1_33_2","first-page":"140","volume-title":"2014 IEEE International Symposium on Workload Characterization (IISWC)","author":"Xu Qiumin","year":"2014","unstructured":"Qiumin Xu, Hyeran Jeon, and Murali Annavaram. 2014. Graph processing on GPUs: Where are the bottlenecks?. In 2014 IEEE International Symposium on Workload Characterization (IISWC). 140\u2013149."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Carl Yang Ayd\u0131n Bulu\u00e7 and John\u00a0D Owens. 2022. GraphBLAST: A high-performance linear algebra-based graph framework on the GPU. ACM Transactions on Mathematical Software (TOMS) 48 1 (2022) 1\u201351.","DOI":"10.1145\/3466795"},{"key":"e_1_3_3_1_35_2","series-title":"(OSDI\u201916)","first-page":"301","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation","author":"Zhu Xiaowei","year":"2016","unstructured":"Xiaowei Zhu, Wenguang Chen, Weimin Zheng, and Xiaosong Ma. 2016. Gemini: a computation-centric distributed graph processing system. In Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation (Savannah, GA, USA) (OSDI\u201916). USENIX Association, USA, 301\u2013316."}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754664","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:39:47Z","timestamp":1766219987000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754664"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":34,"alternative-id":["10.1145\/3754598.3754664","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754664","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}