{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:10Z","timestamp":1755870010537,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730422","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"57-72","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Scaling Large-scale GNN Training to Thousands of Processors on CPU-based Supercomputers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4156-9879","authenticated-orcid":false,"given":"Chen","family":"Zhuang","sequence":"first","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan and Riken Center for Computational Science, Kobe, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2452-1551","authenticated-orcid":false,"given":"Lingqi","family":"Zhang","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4002-0837","authenticated-orcid":false,"given":"Du","family":"Wu","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan and RIKEN Center for Computational Science, Kobe, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1244-3151","authenticated-orcid":false,"given":"Peng","family":"Chen","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5092-3987","authenticated-orcid":false,"given":"Jiajun","family":"Huang","sequence":"additional","affiliation":[{"name":"University of South Florida, Tampa, Florida, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2336-7409","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science & Technology, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7573-7873","authenticated-orcid":false,"given":"Rio","family":"Yokota","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9965-3647","authenticated-orcid":false,"given":"Nikoli","family":"Dryden","sequence":"additional","affiliation":[{"name":"Lawrence Livermore National Laboratory, Livermore, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7297-6211","authenticated-orcid":false,"given":"Toshio","family":"Endo","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1910-8532","authenticated-orcid":false,"given":"Satoshi","family":"Matsuoka","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan and Institute of Science Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7165-2095","authenticated-orcid":false,"given":"Mohamed","family":"Wahib","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Ryousei Takano Shinichiro Takizawa Yusuke Tanimura Hidemoto Nakada Hirotaka Ogawa. 2024. ABCI 3.0: Evolution of the Leading AI Infrastructure in Japan. arXiv preprint arXiv:2411.09134."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Ariful Azad Georgios\u00a0A Pavlopoulos Christos\u00a0A Ouzounis Nikos\u00a0C Kyrpides and Aydin Bulu\u00e7. 2018. HipMCL: a high-performance parallel implementation of the Markov clustering algorithm for large-scale networks. 
Nucleic acids research 46 6 (2018) e33\u2013e33.","DOI":"10.1093\/nar\/gkx1313"},{"key":"e_1_3_3_1_4_2","first-page":"2402","volume-title":"international conference on machine learning","author":"Belbute-Peres Filipe De\u00a0Avila","year":"2020","unstructured":"Filipe De\u00a0Avila Belbute-Peres, Thomas Economon, and Zico Kolter. 2020. Combining differentiable PDE solvers and graph neural networks for fluid flow prediction. In international conference on machine learning. PMLR, 2402\u20132411."},{"key":"e_1_3_3_1_5_2","unstructured":"Open\u00a0Graph Benchmark. 2024. OGB-Leaderboards for Node Property Prediction. https:\/\/ogb.stanford.edu\/docs\/leader_nodeprop\/"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/1963405.1963488"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/988672.988752"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456233"},{"key":"e_1_3_3_1_9_2","first-page":"3541","volume-title":"International Conference on Machine Learning","author":"Cao Yadi","year":"2023","unstructured":"Yadi Cao, Menglei Chai, Minchen Li, and Chenfanfu Jiang. 2023. Efficient learning of mesh-based physical simulation with bi-stride multi-scale graph neural network. In International Conference on Machine Learning. PMLR, 3541\u20133558."},{"key":"e_1_3_3_1_10_2","unstructured":"Jie Chen Tengfei Ma and Cao Xiao. 2018. Fastgcn: fast learning with graph convolutional networks via importance sampling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1801.10247 (2018)."},{"key":"e_1_3_3_1_11_2","first-page":"1803","volume-title":"International Conference on Machine Learning","author":"Chen Jianfei","year":"2021","unstructured":"Jianfei Chen, Lianmin Zheng, Zhewei Yao, Dequan Wang, Ion Stoica, Michael Mahoney, and Joseph Gonzalez. 2021. Actnn: Reducing training memory footprint via 2-bit activation compressed training. In International Conference on Machine Learning. 
PMLR, 1803\u20131813."},{"key":"e_1_3_3_1_12_2","unstructured":"Jianfei Chen Jun Zhu and Le Song. 2017. Stochastic training of graph convolutional networks with variance reduction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1710.10568 (2017)."},{"key":"e_1_3_3_1_13_2","first-page":"942","volume-title":"International Conference on Machine Learning","author":"Chen Jianfei","year":"2018","unstructured":"Jianfei Chen, Jun Zhu, and Le Song. 2018. Stochastic Training of Graph Convolutional Networks with Variance Reduction. In International Conference on Machine Learning. PMLR, 942\u2013950."},{"key":"e_1_3_3_1_14_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et\u00a0al. 2018. TVM: An automated End-to-End optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330925"},{"key":"e_1_3_3_1_16_2","unstructured":"Wei Dai Yi Zhou Nanqing Dong Hao Zhang and Eric\u00a0P Xing. 2018. Toward understanding the impact of staleness in distributed machine learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.03264 (2018)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-53622-3"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICTAI50040.2020.00198"},{"key":"e_1_3_3_1_19_2","unstructured":"Matthias Fey and Jan\u00a0Eric Lenssen. 2019. Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1903.02428 (2019)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531467"},{"key":"e_1_3_3_1_21_2","unstructured":"Fujitsu. 2023. 
A64FX Microarchitecture Manual. https:\/\/github.com\/fujitsu\/A64FX\/blob\/master\/doc\/A64FX_Microarchitecture_Manual_en_1.8.1.pdf Accessed: 2025-02-21."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433723"},{"key":"e_1_3_3_1_23_2","first-page":"551","volume-title":"OSDI","author":"Gandhi Swapnil","year":"2021","unstructured":"Swapnil Gandhi and Anand\u00a0Padmanabha Iyer. 2021. P3: Distributed Deep Graph Learning at Scale.. In OSDI. 551\u2013568."},{"key":"e_1_3_3_1_24_2","unstructured":"Zhijiang Guo Yan Zhang and Wei Lu. 2019. Attention guided graph convolutional networks for relation extraction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.07510 (2019)."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.25080\/TCWV9851"},{"key":"e_1_3_3_1_26_2","unstructured":"Will Hamilton Zhitao Ying and Jure Leskovec. 2017. Inductive representation learning on large graphs. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.83"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"John\u00a0E Hopcroft and Richard\u00a0M Karp. 1973. An n^{5\/2} algorithm for maximum matchings in bipartite graphs. SIAM Journal on computing 2 4 (1973) 225\u2013231.","DOI":"10.1137\/0202019"},{"key":"e_1_3_3_1_29_2","unstructured":"Weihua Hu Matthias Fey Hongyu Ren Maho Nakata Yuxiao Dong and Jure Leskovec. 2021. Ogb-lsc: A large-scale challenge for machine learning on graphs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2103.09430 (2021)."},{"key":"e_1_3_3_1_30_2","unstructured":"Weihua Hu Matthias Fey Marinka Zitnik Yuxiao Dong Hongyu Ren Bowen Liu Michele Catasta and Jure Leskovec. 2020. Open graph benchmark: Datasets for machine learning on graphs. 
Advances in neural information processing systems 33 (2020) 22118\u201322133."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00075"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00076"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"e_1_3_3_1_34_2","unstructured":"Zhihao Jia Sina Lin Mingyu Gao Matei Zaharia and Alex Aiken. 2020. Improving the accuracy scalability and performance of graph neural networks with roc. Proceedings of Machine Learning and Systems 2 (2020) 187\u2013198."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Hao Jiang Peng Cao MingYi Xu Jinzhu Yang and Osmar Zaiane. 2020. Hi-GCN: A hierarchical graph convolution network for graph embedding learning of brain network and brain disorders prediction. Computers in Biology and Medicine 127 (2020) 104096.","DOI":"10.1016\/j.compbiomed.2020.104096"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3431379.3460644"},{"key":"e_1_3_3_1_37_2","unstructured":"Tim Kaler Alexandros Iliopoulos Philip Murzynowski Tao Schardl Charles\u00a0E Leiserson and Jie Chen. 2023. Communication-efficient graph neural networks with probabilistic neighborhood expansion analysis and caching. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_3_1_38_2","unstructured":"Tim Kaler Nickolas Stathas Anne Ouyang Alexandros-Stavros Iliopoulos Tao Schardl Charles\u00a0E Leiserson and Jie Chen. 2022. Accelerating training and inference of graph neural networks with fast sampling and pipelining. Proceedings of Machine Learning and Systems 4 (2022) 172\u2013189."},{"key":"e_1_3_3_1_39_2","unstructured":"George Karypis Kirk Schloegel and Vipin Kumar. 1997. Parmetis: Parallel graph partitioning and sparse matrix ordering library. 
(1997)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599843"},{"key":"e_1_3_3_1_41_2","unstructured":"Thomas\u00a0N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.02907 (2016)."},{"key":"e_1_3_3_1_42_2","unstructured":"D\u00e9nes K\u0151nig. 1931. Gr\u00e1fok \u00e9s m\u00e1trixok. Matematikai \u00e9s Fizikai Lapok 38 (1931) 116\u2013119."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Remi Lam Alvaro Sanchez-Gonzalez Matthew Willson Peter Wirnsberger Meire Fortunato Ferran Alet Suman Ravuri Timo Ewalds Zach Eaton-Rosen Weihua Hu et\u00a0al. 2023. Learning skillful medium-range global weather forecasting. Science 382 6677 (2023) 1416\u20131421.","DOI":"10.1126\/science.adi2336"},{"key":"e_1_3_3_1_44_2","first-page":"443","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Ma Lingxiao","year":"2019","unstructured":"Lingxiao Ma, Zhi Yang, Youshan Miao, Jilong Xue, Ming Wu, Lidong Zhou, and Yafei Dai. 2019. NeuGraph: Parallel deep neural network computation on large graphs. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 443\u2013458."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3480856"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Jintao Meng Peng Chen Mohamed Wahib Mingjun Yang Liangzhen Zheng Yanjie Wei Shengzhong Feng and Wei Liu. 2022. Boosting the predictive performance with aqueous solubility dataset curation. Scientific Data 9 1 (2022) 71.","DOI":"10.1038\/s41597-022-01154-3"},{"key":"e_1_3_3_1_47_2","unstructured":"Hesham Mostafa. 2022. Sequential aggregation and rematerialization: Distributed full-batch training of graph neural networks on large graphs. 
Proceedings of Machine Learning and Systems 4 (2022) 265\u2013275."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Jingshu Peng Zhao Chen Yingxia Shao Yanyan Shen Lei Chen and Jiannong Cao. 2022. Sancus: staleness-aware communication-avoiding full-graph decentralized training in large-scale graph neural networks. Proceedings of the VLDB Endowment 15 9 (2022) 1937\u20131950.","DOI":"10.14778\/3538598.3538614"},{"key":"e_1_3_3_1_49_2","unstructured":"Tobias Pfaff Meire Fortunato Alvaro Sanchez-Gonzalez and Peter\u00a0W Battaglia. 2020. Learning mesh-based simulation with graph networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.03409 (2020)."},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Seongok Ryu Yongchan Kwon and Woo\u00a0Youn Kim. 2019. A Bayesian graph convolutional network for reliable prediction of molecular properties with uncertainty quantification. Chemical science 10 36 (2019) 8438\u20138446.","DOI":"10.1039\/C9SC01992H"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433763"},{"key":"e_1_3_3_1_52_2","unstructured":"Yunsheng Shi Zhengjie Huang Shikun Feng Hui Zhong Wenjin Wang and Yu Sun. 2020. Masked label prediction: Unified message passing model for semi-supervised classification. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2009.03509 (2020)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Mengying Sun Sendong Zhao Coryandar Gilvary Olivier Elemento Jiayu Zhou and Fei Wang. 2020. Graph convolutional networks for computational drug development and discovery. 
Briefings in bioinformatics 21 3 (2020) 919\u2013935.","DOI":"10.1093\/bib\/bbz042"},{"key":"e_1_3_3_1_54_2","first-page":"495","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Thorpe John","year":"2021","unstructured":"John Thorpe, Yifan Qiao, Jonathan Eyolfson, Shen Teng, Guanzhou Hu, Zhihao Jia, Jinliang Wei, Keval Vora, Ravi Netravali, Miryung Kim, et\u00a0al. 2021. Dorylus: Affordable, scalable, and accurate GNN training with distributed CPU servers and serverless threads. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). 495\u2013514."},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433794"},{"key":"e_1_3_3_1_56_2","unstructured":"Petar Velickovic Guillem Cucurull Arantxa Casanova Adriana Romero Pietro Lio Yoshua Bengio et\u00a0al. 2017. Graph attention networks. stat 1050 20 (2017) 10\u201348550."},{"key":"e_1_3_3_1_57_2","unstructured":"Borui Wan Juntao Zhao and Chuan Wu. 2023. Adaptive Message Quantization and Parallelization for Distributed Full-graph GNN Training. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_3_1_58_2","unstructured":"Cheng Wan Youjie Li Ang Li Nam\u00a0Sung Kim and Yingyan Lin. 2022. BNS-GCN: Efficient full-graph training of graph convolutional networks with partition-parallelism and random boundary node sampling. Proceedings of Machine Learning and Systems 4 (2022) 673\u2013693."},{"key":"e_1_3_3_1_59_2","volume-title":"International Conference on Learning Representations","author":"Wan Cheng","year":"2022","unstructured":"Cheng Wan, Youjie Li, Cameron\u00a0R. Wolfe, Anastasios Kyrillidis, Nam\u00a0Sung Kim, and Yingyan Lin. 2022. PipeGCN: Efficient Full-Graph Training of Graph Convolutional Networks with Pipelined Feature Communication. In International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=kSwqMH0zn1F"},{"key":"e_1_3_3_1_60_2","unstructured":"Hongwei Wang and Jure Leskovec. 2020. Unifying graph convolutional neural networks and label propagation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2002.06755 (2020)."},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCSW.2011.34"},{"key":"e_1_3_3_1_62_2","unstructured":"Keyulu Xu Weihua Hu Jure Leskovec and Stefanie Jegelka. 2018. How powerful are graph neural networks? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.00826 (2018)."},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3340404"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582047"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219890"},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"crossref","unstructured":"Ye Yuan and Ziv Bar-Joseph. 2020. GCNG: graph convolutional networks for inferring gene interaction from spatial transcriptomics data. 
Genome biology 21 1 (2020) 1\u201316.","DOI":"10.1186\/s13059-020-02214-w"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00293"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/IA351965.2020.00011"},{"key":"e_1_3_3_1_69_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00204"},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTERWorkshops61563.2024.00038"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730422","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:59:11Z","timestamp":1755867551000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730422"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":69,"alternative-id":["10.1145\/3721145.3730422","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730422","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}