{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T23:30:02Z","timestamp":1777937402203,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":75,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"DOE","award":["DE-AC02-06CH11357,DE-SC0024207"],"award-info":[{"award-number":["DE-AC02-06CH11357,DE-SC0024207"]}]},{"name":"NSF","award":["OAC-2104023,OAC-2311875"],"award-info":[{"award-number":["OAC-2104023,OAC-2311875"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3733642","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"43-56","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["ghZCCL: Advancing GPU-aware Collective Communications with Homomorphic Compression"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5092-3987","authenticated-orcid":false,"given":"Jiajun","family":"Huang","sequence":"first","affiliation":[{"name":"University of South Florida, Tampa, FL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9935-5674","authenticated-orcid":false,"given":"Sheng","family":"Di","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7370-6766","authenticated-orcid":false,"given":"Yafan","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Iowa, Iowa City, IA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2578-4940","authenticated-orcid":false,"given":"Zizhong","family":"Chen","sequence":"additional","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7890-3934","authenticated-orcid":false,"given":"Franck","family":"Cappello","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3731-5423","authenticated-orcid":false,"given":"Yanfei","family":"Guo","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5532-3048","authenticated-orcid":false,"given":"Rajeev","family":"Thakur","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"265","volume-title":"12th { USENIX} symposium on operating systems design and implementation ({ OSDI} 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et\u00a0al. 2016. Tensorflow: A system for large-scale machine learning. In 12th { USENIX} symposium on operating systems design and implementation ({ OSDI} 16). 265\u2013283."},{"key":"e_1_3_3_1_3_2","unstructured":"Ahmed\u00a0M. Abdelmoniem Ahmed Elzanaty Mohamed-Slim Alouini and Marco Canini. 2021. An Efficient Statistical-based Gradient Compression Technique for Distributed Training Systems. arxiv:https:\/\/arXiv.org\/abs\/2101.10761\u00a0[cs.LG]"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/1088149.1088183"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018769"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ExaMPI49596.2019.00007"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","unstructured":"Gaurav Bansal Ajith Mascarenhas and Jacqueline\u00a0H. Chen. 2015. Direct Numerical Simulations of Autoignition in Stratified Dimethyl-Ether (DME)\/Air Turbulent Mixtures. Combustion and Flame 162 (2015) 688\u2013702. 10.1016\/j.combustflame.2014.08.021","DOI":"10.1016\/j.combustflame.2014.08.021"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2018.00014"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","unstructured":"Jon Calhoun Franck Cappello Luke\u00a0N Olson Marc Snir and William\u00a0D Gropp. 2019. Exploring the feasibility of lossy compression for PDE simulations. The International Journal of High Performance Computing Applications 33 2 (2019) 397\u2013410. 10.1177\/1094342018762036","DOI":"10.1177\/1094342018762036"},{"key":"e_1_3_3_1_10_2","unstructured":"Qiaoling Chen Qinghao Hu Guoteng Wang Yingtong Xiong Ting Huang Xun Chen Yang Gao Hang Yan Yonggang Wen Tianwei Zhang and Peng Sun. 2024. AMSP: Reducing Communication Overhead of ZeRO for Efficient LLM Training. arxiv:https:\/\/arXiv.org\/abs\/2311.00257\u00a0[cs.DC]"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00033"},{"key":"e_1_3_3_1_12_2","unstructured":"Community Earth System Model (CESM) Atmosphere Model. 2019. http:\/\/www.cesm.ucar.edu\/models\/. Online."},{"key":"e_1_3_3_1_13_2","unstructured":"NVIDIA Corp.2023. NCCL \u2013 Optimized primitives for inter-GPU communication. https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_3_1_14_2","unstructured":"Michael\u00a0Garland Duane\u00a0Merrill. 2016. Single-pass Parallel Prefix Scan with Decoupled Look-back. https:\/\/research.nvidia.com\/sites\/default\/files\/pubs\/2016-03_Single-pass-Parallel-Prefix\/nvr-2016-002.pdf."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"R.\u00a0W. Grout A. Gruber H. Kolla P.-T. Bremer J.\u00a0C. Bennett A. Gyulassy and J.\u00a0H. Chen. 2012. A Direct Numerical Simulation Study of Turbulence and Flame Structure in Transverse Jets Analysed in Jet-Trajectory Based Coordinates. Journal of Fluid Mechanics 706 (2012) 351\u2013383. 10.1017\/jfm.2012.257","DOI":"10.1017\/jfm.2012.257"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPDC52870.2021.9521599"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/LDAV.2018.8739165"},{"key":"e_1_3_3_1_18_2","unstructured":"HPE. [n. d.]. Cray MPI\/MPICH. https:\/\/cpe.ext.hpe.com\/docs\/24.03\/mpt\/mpich\/index.html."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638467"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656636"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00110"},{"key":"e_1_3_3_1_22_2","unstructured":"Jiajun Huang Sheng Di Xiaodong Yu Yujia Zhai Zhaorui Zhang Jinyang Liu Xiaoyi Lu Ken Raffenetti Hui Zhou Kai Zhao Khalid Alharthi Zizhong Chen Franck Cappello Yanfei Guo and Rajeev Thakur. 2025. ZCCL: Significantly Improving Collective Communication With Error-Bounded Lossy Compression. arxiv:https:\/\/arXiv.org\/abs\/2502.18554\u00a0[cs.DC]"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00072"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/BigData59044.2023.10386386"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00021"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607048"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307681.3326608"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","unstructured":"Sian Jin Chengming Zhang Xintong Jiang Yunhe Feng Hui Guan Guanpeng Li Shuaiwen\u00a0Leon Song and Dingwen Tao. 2021. COMET: a novel memory-efficient deep learning training framework by using error-bounded lossy compression. Proc. VLDB Endow. 15 4 (Dec. 2021) 886\u2013899. 10.14778\/3503585.3503597","DOI":"10.14778\/3503585.3503597"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Suha Kayum et\u00a0al. 2020. GeoDRIVE \u2013 A high performance computing flexible platform for seismic applications. First Break 38 2 (2020) 97\u2013100.","DOI":"10.3997\/1365-2397.fb2020015"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/PacificVis53943.2022.00017"},{"key":"e_1_3_3_1_31_2","unstructured":"Argonne\u00a0National Laboratory. 2023. MPICH \u2013 A high-performance and widely portable implementation of the MPI-4.0 standard. https:\/\/www.mpich.org."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356193"},{"key":"e_1_3_3_1_33_2","volume-title":"The International Conference on Learning Representations","author":"Lin Yujun","year":"2018","unstructured":"Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William\u00a0J Dally. 2018. Deep Gradient Compression: Reducing the communication bandwidth for distributed training. In The International Conference on Learning Representations."},{"key":"e_1_3_3_1_34_2","unstructured":"Peter Lindstrom. [n. d.]. cuzfp. https:\/\/github.com\/LLNL\/zfp\/tree\/develop\/src\/cuda_zfp."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00019"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.18429\/JACoW-FEL2015-TUP007"},{"key":"e_1_3_3_1_37_2","unstructured":"NYX simulation. 2019. https:\/\/amrex-astro.github.io\/Nyx. Online."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Pitch Patarasuk and Xin Yuan. 2009. Bandwidth optimal all-reduce algorithms for clusters of workstations. J. Parallel and Distrib. Comput. 69 2 (2009) 117\u2013124.","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2015.67"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Rajeev Thakur Rolf Rabenseifner and William Gropp. 2005. Optimization of collective communication operations in MPICH. The International Journal of High Performance Computing Applications 19 1 (2005) 49\u201366.","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414624"},{"key":"e_1_3_3_1_42_2","unstructured":"TOP500.org. 2024. TOP500 LIST - NOVEMBER 2024. https:\/\/top500.org\/lists\/top500\/2024\/11\/."},{"key":"e_1_3_3_1_43_2","unstructured":"Didem Unat Ilyas Turimbetov Mohammed Kefah\u00a0Taha Issa Do\u011fan Sa\u011fbili Flavio Vella Daniele\u00a0De Sensi and Ismayil Ismayilov. 2024. The Landscape of GPU-Centric Communication. arxiv:https:\/\/arXiv.org\/abs\/2409.09874\u00a0[cs.DC]"},{"key":"e_1_3_3_1_44_2","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Wen Wei","year":"2017","unstructured":"Wei Wen, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. 2017. TernGrad: ternary gradients to reduce communication in distributed deep learning. In Proceedings of the 31st International Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356155"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531473"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3588195.3592994"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00145"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00053"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-07312-0_1"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3733642","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3733642","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:04:25Z","timestamp":1755867865000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3733642"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":49,"alternative-id":["10.1145\/3721145.3733642","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3733642","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}