{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,26]],"date-time":"2025-12-26T03:47:43Z","timestamp":1766720863340,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":75,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CCF-2316159 and CCF-2316157"],"award-info":[{"award-number":["CCF-2316159 and CCF-2316157"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["N66001-21-C-4023"],"award-info":[{"award-number":["N66001-21-C-4023"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006224","name":"Argonne National Laboratory","doi-asserted-by":"publisher","award":["Director?s Discretionary"],"award-info":[{"award-number":["Director?s Discretionary"]}],"id":[{"id":"10.13039\/100006224","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730431","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"822-836","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-Node Multi-GPU Datalog"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5718-4977","authenticated-orcid":false,"given":"Ahmedur Rahman","family":"Shovon","sequence":"first","affiliation":[{"name":"University of Illinois Chicago, Chicago, IL, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0946-2511","authenticated-orcid":false,"given":"Yihao","family":"Sun","sequence":"additional","affiliation":[{"name":"Syracuse University, Syracuse, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8650-0991","authenticated-orcid":false,"given":"Kristopher","family":"Micinski","sequence":"additional","affiliation":[{"name":"Syracure University, Syracuse, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0393-8542","authenticated-orcid":false,"given":"Thomas","family":"Gilray","sequence":"additional","affiliation":[{"name":"Washington State University, Pullman, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0418-9962","authenticated-orcid":false,"given":"Sidharth","family":"Kumar","sequence":"additional","affiliation":[{"name":"University of Illinois Chicago, Chicago, IL, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079079.3079103"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Ahmad Abdelfattah David Keyes and Hatem Ltaief. 2016. Kblas: An optimized library for dense matrix-vector multiplication on gpu accelerators. ACM Transactions on Mathematical Software (TOMS) 42 3 (2016) 1\u201331.","DOI":"10.1145\/2818311"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.5555\/551350"},{"key":"e_1_3_3_1_5_2","unstructured":"AMD. 2024. 5TH GEN AMD EPYC\u2122 PROCESSOR ARCHITECTURE. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/epyc-business-docs\/white-papers\/5th-gen-amd-epyc-processor-architecture-white-paper.pdf."},{"key":"e_1_3_3_1_6_2","unstructured":"Argonne Leadership Computing Facility. 2022. Polaris. https:\/\/www.alcf.anl.gov\/polaris."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/2966884.2966912"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-53413-7_5"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Cagri Balkesen Gustavo Alonso Jens Teubner and M.\u00a0Tamer \u00d6zsu. 2013. Multi-core main-memory joins: sort vs. hash revisited. Proc. VLDB Endow. 7 1 (Sept. 2013) 85\u201396. https:\/\/doi.org\/10.14778\/2732219.2732227","DOI":"10.14778\/2732219.2732227"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Claude Barthels Ingo M\u00fcller Timo Schneider Gustavo Alonso and Torsten Hoefler. 2017. Distributed join algorithms on thousands of cores. Proceedings of the VLDB Endowment 10 5 (2017) 517\u2013528.","DOI":"10.14778\/3055540.3055545"},{"key":"e_1_3_3_1_11_2","unstructured":"PAUL BIBERSTEIN ZIYANG LI JOSEPH DEVIETTI and MAYUR NAIK. [n. d.]. Lobster: A GPU-Accelerated Framework for Neurosymbolic Programming. ([n. d.])."},{"key":"e_1_3_3_1_12_2","first-page":"54","volume-title":"VLDB","author":"Boncz Peter\u00a0A","year":"1999","unstructured":"Peter\u00a0A Boncz, Stefan Manegold, Martin\u00a0L Kersten, et\u00a0al. 1999. Database architecture optimized for the new bottleneck: Memory access. In VLDB , Vol.\u00a099. 54\u201365."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/1640089.1640108"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Stefano Ceri Georg Gottlob Letizia Tanca et\u00a0al. 1989. What you always wanted to know about Datalog(and never dared to ask). IEEE transactions on knowledge and data engineering 1 1 (1989) 146\u2013166.","DOI":"10.1109\/69.43410"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Evgeny Dantsin Thomas Eiter Georg Gottlob and Andrei Voronkov. 2001. Complexity and Expressive Power of Logic Programming. ACM Comput. Surv. 33 3 (sep 2001) 374\u2013425. https:\/\/doi.org\/10.1145\/502807.502810","DOI":"10.1145\/502807.502810"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The University of Florida Sparse Matrix Collection. ACM Trans. Math. Softw. 38 1 Article 1 (dec 2011) 25\u00a0pages. https:\/\/doi.org\/10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_1_17_2","volume-title":"Datalog Reloaded: First International Workshop, Datalog 2010, Oxford, UK, March 16-19, 2010. Revised Selected Papers","author":"De\u00a0Moor Oege","year":"2012","unstructured":"Oege De\u00a0Moor, Georg Gottlob, Tim Furche, and Andrew Sellers. 2012. Datalog Reloaded: First International Workshop, Datalog 2010, Oxford, UK, March 16-19, 2010. Revised Selected Papers. Vol.\u00a06702. Springer."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531468"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528936"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Zhiwei Fan Jianqiao Zhu Zuyu Zhang Aws Albarghouthi Paraschos Koutris and Jignesh\u00a0M. Patel. 2019. Scaling-up in-Memory Datalog Processing: Observations and Techniques. Proc. VLDB Endow. 12 6 (Feb 2019) 695\u2013708. https:\/\/doi.org\/10.14778\/3311880.3311886","DOI":"10.14778\/3311880.3311886"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/1058129.1058148"},{"key":"e_1_3_3_1_22_2","first-page":"1075","volume-title":"29th USENIX Security Symposium (USENIX Security 20)","author":"Flores-Montoya Antonio","year":"2020","unstructured":"Antonio Flores-Montoya and Eric Schulte. 2020. Datalog disassembly. In 29th USENIX Security Symposium (USENIX Security 20). 1075\u20131092."},{"key":"e_1_3_3_1_23_2","first-page":"209","volume-title":"VLDB","author":"Fushimi Shinya","year":"1986","unstructured":"Shinya Fushimi, Masaru Kitsuregawa, and Hidehiko Tanaka. 1986. An Overview of The System Software of A Parallel Relational Database Machine GRACE.. In VLDB , Vol.\u00a086. 209\u2013219."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Thomas Gilray Arash Sahebolamri Yihao Sun Sowmith Kunapaneni Sidharth Kumar and Kristopher Micinski. 2024. Datalog with First-Class Facts. Proc. VLDB Endow. 18 3 (Nov. 2024) 651\u2013665. https:\/\/doi.org\/10.14778\/3712221.3712232","DOI":"10.14778\/3712221.3712232"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Oded Green. 2021. HashGraph\u2014Scalable hash tables using a sparse graph data structure. ACM Transactions on Parallel Computing (TOPC) 8 2 (2021) 1\u201317.","DOI":"10.1145\/3460872"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/1265530.1265535"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3324959"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/2588555.2594530"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Pieter Hijma Stijn Heldens Alessio Sclocco Ben van Werkhoven and Henri\u00a0E. Bal. 2023. Optimization Techniques for GPU Programming. ACM Comput. Surv. 55 11 Article 239 (March 2023) 81\u00a0pages. https:\/\/doi.org\/10.1145\/3570638","DOI":"10.1145\/3570638"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Muhammad Imran G\u00e1bor\u00a0E G\u00e9vay Jorge-Arnulfo Quian\u00e9-Ruiz and Volker Markl. 2022. Fast datalog evaluation for batch and stream graph processing. World Wide Web 25 2 (2022) 971\u20131003.","DOI":"10.1007\/s11280-021-00960-w"},{"key":"e_1_3_3_1_31_2","unstructured":"Intel. 2023. Intel MPI for GPU Clusters. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/oneapi\/optimization-guide-gpu\/2023-2\/intel-mpi-for-gpu-clusters.html."},{"key":"e_1_3_3_1_32_2","unstructured":"Jiri Kraus. 2013. An Introduction to CUDA-Aware MPI. https:\/\/developer.nvidia.com\/blog\/introduction-cuda-aware-mpi\/."},{"key":"e_1_3_3_1_33_2","first-page":"422","volume-title":"Computer Aided Verification: 28th International Conference, CAV 2016, Toronto, ON, Canada, July 17-23, 2016, Proceedings, Part II 28","author":"Jordan Herbert","year":"2016","unstructured":"Herbert Jordan, Bernhard Scholz, and Pavle Suboti\u0107. 2016. Souffl\u00e9: On synthesis of program analyzers. In Computer Aided Verification: 28th International Conference, CAV 2016, Toronto, ON, Canada, July 17-23, 2016, Proceedings, Part II 28. Springer, 422\u2013430."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Changkyu Kim Tim Kaldewey Victor\u00a0W Lee Eric Sedlar Anthony\u00a0D Nguyen Nadathur Satish Jatin Chhugani Andrea Di\u00a0Blas and Pradeep Dubey. 2009. Sort vs. hash revisited: Fast join implementation on modern multi-core CPUs. Proceedings of the VLDB Endowment 2 2 (2009) 1378\u20131389.","DOI":"10.14778\/1687553.1687564"},{"key":"e_1_3_3_1_35_2","volume-title":"International Conference on High Performance Computing, Data, and Analytics (HiPC). IEEE","volume":"1","author":"Kumar Sidharth","year":"2019","unstructured":"Sidharth Kumar and Thomas Gilray. 2019. Distributed relational algebra at scale. In International Conference on High Performance Computing, Data, and Analytics (HiPC). IEEE , Vol.\u00a01."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-50743-5_15"},{"key":"e_1_3_3_1_37_2","unstructured":"Zhuohang Lai Xibo Sun Qiong Luo and Xiaolong Xie. 2022. Accelerating multi-way joins on the GPU. The VLDB Journal (2022) 1\u201325."},{"key":"e_1_3_3_1_38_2","unstructured":"Jure Leskovec and Andrej Krevl. 2014. SNAP Datasets: Stanford Large Network Dataset Collection. http:\/\/snap.stanford.edu\/data."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/11535331_16"},{"key":"e_1_3_3_1_40_2","first-page":"152","volume-title":"International Conference on Applications of Declarative Programming and Knowledge Management","author":"Mart\u00ednez-Angeles Carlos\u00a0Alberto","year":"2013","unstructured":"Carlos\u00a0Alberto Mart\u00ednez-Angeles, In\u00eas Dutra, V\u00edtor\u00a0Santos Costa, and Jorge Buenabad-Ch\u00e1vez. 2013. A datalog engine for gpus. In International Conference on Applications of Declarative Programming and Knowledge Management. Springer, 152\u2013168."},{"key":"e_1_3_3_1_41_2","unstructured":"Adithya Murali Atharva Sehgal Paul Krogmeier and P Madhusudan. 2019. Composing neural learning and symbolic reasoning with an application to visual discrimination. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.05878 (2019)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-25010-6_1"},{"key":"e_1_3_3_1_43_2","unstructured":"NVIDIA. 2022. NVIDIA H100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/."},{"key":"e_1_3_3_1_44_2","unstructured":"NVIDIA. 2024. CUDA C Programming Guide: SIMT. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/#simt-architecture."},{"key":"e_1_3_3_1_45_2","unstructured":"NVIDIA. 2025. CUDA Thrust API documentation: thrust::unique function. https:\/\/nvidia.github.io\/cccl\/thrust\/api\/function_group__stream__compaction_1gaccf33f1e24f8526b003f8a679591ad65.html."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","first-page":"278","DOI":"10.1109\/ICDE.2013.6544832","volume-title":"2013 IEEE 29th International Conference on Data Engineering (ICDE)","author":"Seo Jiwon","year":"2013","unstructured":"Jiwon Seo, Stephen Guo, and Monica\u00a0S Lam. 2013. SociaLite: Datalog extensions for efficient social network analysis. In 2013 IEEE 29th International Conference on Data Engineering (ICDE). IEEE, 278\u2013289."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3635035.3635047"},{"key":"e_1_3_3_1_48_2","first-page":"418","volume-title":"Proc. Int. Conf. on Signal Processing and Communications Systems (ICSPCS)","author":"Shams Ramtin","year":"2007","unstructured":"Ramtin Shams, RA Kennedy, et\u00a0al. 2007. Efficient histogram algorithms for NVIDIA CUDA compatible devices. In Proc. Int. Conf. on Signal Processing and Communications Systems (ICSPCS). Citeseer, 418\u2013422."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2915229"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/IA356718.2022.00012"},{"key":"e_1_3_3_1_51_2","first-page":"1009","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Shovon Ahmedur\u00a0Rahman","year":"2023","unstructured":"Ahmedur\u00a0Rahman Shovon, Thomas Gilray, Kristopher Micinski, and Sidharth Kumar. 2023. Towards iterative relational algebra on the { GPU}. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). 1009\u20131016."},{"key":"e_1_3_3_1_52_2","first-page":"842","volume-title":"EDBT","author":"Skvortsov Evgeny","year":"2024","unstructured":"Evgeny Skvortsov, Yilin Xia, and Bertram Lud\u00e4scher. 2024. Logica: Declarative Data Science for Mere Mortals.. In EDBT. 842\u2013845."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3064043"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER52292.2023.00024"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707274"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528923"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"crossref","unstructured":"Hao Wang Sreeram Potluri Devendar Bureddy Carlos Rosales and Dhabaleswar\u00a0K Panda. 2013. GPU-aware MPI on RDMA-enabled clusters: Design implementation and evaluation. IEEE Transactions on Parallel and Distributed Systems 25 10 (2013) 2595\u20132605.","DOI":"10.1109\/TPDS.2013.222"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/2581122.2544166"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3584372.3588675"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"crossref","unstructured":"Hangdong Zhao Shaleen Deep Paraschos Koutris Sudeepa Roy and Val Tannen. 2024. Evaluating Datalog over Semirings: A Grounding-based Approach. Proc. ACM Manag. Data 2 2 Article 90 (May 2024) 26\u00a0pages. https:\/\/doi.org\/10.1145\/3651591","DOI":"10.1145\/3651591"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730431","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730431","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:01:30Z","timestamp":1755867690000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730431"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":59,"alternative-id":["10.1145\/3721145.3730431","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730431","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}