{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,13]],"date-time":"2025-11-13T18:33:49Z","timestamp":1763058829396,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,29]],"date-time":"2022-08-29T00:00:00Z","timestamp":1661731200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-AC05-00OR22725"],"award-info":[{"award-number":["DE-AC05-00OR22725"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,29]]},"DOI":"10.1145\/3547276.3548627","type":"proceedings-article","created":{"date-parts":[[2023,1,15]],"date-time":"2023-01-15T00:56:17Z","timestamp":1673744177000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["A Study on Atomics-based Integer Sum Reduction in HIP on AMD GPU"],"prefix":"10.1145","author":[{"given":"Zheming","family":"Jin","sequence":"first","affiliation":[{"name":"Oak Ridge National Laboratory, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeffrey","family":"Vetter","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeffrey","family":"Vetter","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Mark H. 2008. Optimizing parallel reduction in CUDA. NVIDIA CUDA SDK."},{"volume-title":"High Performance Computing and Simulation (HPCS), 2012 International Conference on (pp. 511-519)","author":"Mart\u00edn P.J.","key":"e_1_3_2_1_2_1","unstructured":"Mart\u00edn, P.J., Ayuso, L.F., Torres, R. and Gavilanes, A., 2012, July. Algorithmic strategies for optimizing the parallel reduction primitive in CUDA. In High Performance Computing and Simulation (HPCS), 2012 International Conference on (pp. 511-519). IEEE."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661187"},{"key":"e_1_3_2_1_4_1","volume-title":"Feb.","author":"Luitjens","year":"2014","unstructured":"J. Luitjens, Faster Parallel Reductions on Kepler, Feb. 2014, [online] Available: http:\/\/devblogs.nvidia.com\/parallelforall\/faster-parallel-reductions-kepler."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.5194\/gmd-14-2781-2021"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3067731"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318170.3318178"},{"volume-title":"August. Evaluating the Performance of Integer Sum Reduction in SYCL on GPUs. In 50th International Conference on Parallel Processing Workshop (pp. 1-8).","author":"Jin Z.","key":"e_1_3_2_1_8_1","unstructured":"Jin, Z. and Vetter, J., 2021, August. Evaluating the Performance of Integer Sum Reduction in SYCL on GPUs. In 50th International Conference on Parallel Processing Workshop (pp. 1-8)."},{"key":"e_1_3_2_1_9_1","unstructured":"HIP Programming Guide [online] Available: https:\/\/rocmdocs.amd.com\/en\/latest\/Programming_Guides\/HIP-GUIDE.html"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2009.7478342"},{"volume-title":"Proceedings of the International Workshop on OpenCL (pp. 1-3).","author":"Babej M.","key":"e_1_3_2_1_11_1","unstructured":"Babej, M. and J\u00e4\u00e4skel\u00e4inen, P., 2020, April. HIPCL: Tool for Porting CUDA Applications to Advanced OpenCL Platforms Through HIP. In Proceedings of the International Workshop on OpenCL (pp. 1-3)."},{"key":"e_1_3_2_1_12_1","first-page":"608","volume-title":"Pitfalls of analyzing GPUs at the intermediate language level.\" In 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Gutierrez","year":"2018","unstructured":"Gutierrez, Anthony, Bradford M. Beckmann, Alexandru Dutu, Joseph Gross, Michael LeBeane, John Kalamatianos, Onur Kayiran \"Lost in abstraction: Pitfalls of analyzing GPUs at the intermediate language level.\" In 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp. 608-619. IEEE, 2018"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Reinders J. Ashbaugh B. Brodman J. Kinsner M. Pennycook J. and Tian X. 2021. Data parallel C++: mastering DPC++ for programming of heterogeneous systems using C++ and SYCL (p. 548). Springer Nature.","DOI":"10.1007\/978-1-4842-5574-2"},{"key":"e_1_3_2_1_14_1","unstructured":"AMD Instinct MI100 Instruction Set Architecture [online] Available: https:\/\/developer.amd.com\/wp-content\/resources\/CDNA1_Shader_ISA_14December2020.pdf"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2775049.2602993"},{"key":"e_1_3_2_1_16_1","unstructured":"ROC profiler library [online] Available: https:\/\/github.com\/ROCm-Developer-Tools\/rocprofiler"},{"key":"e_1_3_2_1_17_1","unstructured":"ROCm Thrust - run Thrust dependent software on AMD GPUs [online] Available: https:\/\/github.com\/ROCmSoftwarePlatform\/rocThrust"},{"key":"e_1_3_2_1_18_1","unstructured":"ROCm Parallel Primitives [online] Available: https:\/\/github.com\/ROCmSoftwarePlatform\/rocPRIM"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-385963-1.00026-5"},{"key":"e_1_3_2_1_20_1","unstructured":"Reusable software components for ROCm developers [online] Available: https:\/\/github.com\/ROCmSoftwarePlatform\/hipCUB"},{"key":"e_1_3_2_1_21_1","volume-title":"CUB: A pattern of \u201ccollective","author":"Merrill D.","year":"2015","unstructured":"Merrill, D., 2015. CUB: A pattern of \u201ccollective\u201d software design, abstraction, and reuse for kernel-level programming. Nvidia Research"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Tan Y. 2016. GPU-based parallel implementation of swarm intelligence algorithms. Morgan Kaufmann.","DOI":"10.1016\/B978-0-12-809362-7.50003-0"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2012.319"},{"key":"e_1_3_2_1_24_1","unstructured":"Gutierrez A. Beckmann B.M. Puthoor S. Sinclair M.D. Ta T. and Zhang X. 2018 June. AMD gem5 APU simulator: Modeling GPUs Using the Machine ISA. In Tutorial at International Symposium on Computer Architecture."},{"key":"e_1_3_2_1_25_1","volume-title":"Aug.","author":"Catanzaro","year":"2014","unstructured":"B. Catanzaro, OpenCL Optimization Case Study: Simple Reductions, published by Advanced Micro Devices, Aug. 2014."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/WSCAD.2018.00013"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2011.6114174"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3120895.3120915"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3203217.3203244"},{"key":"e_1_3_2_1_30_1","unstructured":"How does HIP compare with OpenCL [online] Available: https:\/\/rocmdocs.amd.com\/en\/latest\/Programming_Guides\/HIP-FAQ.html#how-does-hip-compare-with-opencl"},{"key":"e_1_3_2_1_31_1","unstructured":"Reduction example with atomic add using HIP [online] Available: https:\/\/github.com\/ROCm-Developer-Tools\/HIP-Examples\/blob\/master\/reduction\/"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Gaster B. Howes L. Kaeli D.R. Mistry P. and Schaa D. 2012. Heterogeneous Computing with OpenCL: Revised OpenCL 1.2 Edition","DOI":"10.1016\/B978-0-12-387766-6.00024-4"}],"event":{"name":"ICPP '22: 51st International Conference on Parallel Processing","acronym":"ICPP '22","location":"Bordeaux France"},"container-title":["Workshop Proceedings of the 51st International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3547276.3548627","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3547276.3548627","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3547276.3548627","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:56Z","timestamp":1750186976000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3547276.3548627"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,29]]},"references-count":32,"alternative-id":["10.1145\/3547276.3548627","10.1145\/3547276"],"URL":"https:\/\/doi.org\/10.1145\/3547276.3548627","relation":{},"subject":[],"published":{"date-parts":[[2022,8,29]]},"assertion":[{"value":"2023-01-13","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}