{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T07:21:04Z","timestamp":1777965664980,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["1911245, 2432018"],"award-info":[{"award-number":["1911245, 2432018"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,16]]},"DOI":"10.1145\/3694906.3743337","type":"proceedings-article","created":{"date-parts":[[2025,7,16]],"date-time":"2025-07-16T16:19:56Z","timestamp":1752682796000},"page":"158-170","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Eliminating Bank Conflicts in GPU Mergesort"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8918-7067","authenticated-orcid":false,"given":"Kyle","family":"Berney","sequence":"first","affiliation":[{"name":"University of Hawaii at M\u0101noa, Honolulu, HI, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8876-4846","authenticated-orcid":false,"given":"Nodari","family":"Sitchinava","sequence":"additional","affiliation":[{"name":"University of Hawaii at M\u0101noa, Honolulu, HI, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,16]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-48350-3_2"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/48529.48535"},{"key":"e_1_3_2_1_3_1","volume-title":"Number Theory","author":"Andrews George E.","unstructured":"George E. Andrews. 1994. Number Theory. Dover Publications, Inc."},{"key":"e_1_3_2_1_4_1","volume-title":"Fundamental Parallel Algorithms for Private-cache Chip Multiprocessors. In Symposium on Parallelism in Algorithms and Architectures. ACM, 197--206","author":"Arge Lars","year":"2008","unstructured":"Lars Arge, Michael T. Goodrich, Michael J. Nelson, and Nodari Sitchinava. 2008. Fundamental Parallel Algorithms for Private-cache Chip Multiprocessors. In Symposium on Parallelism in Algorithms and Architectures. ACM, 197--206."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3108139"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926259"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3075392"},{"key":"e_1_3_2_1_8_1","volume-title":"Engineering Worst-Case Inputs for Pairwise Merge Sort on GPUs. In International Parallel and Distributed Processing Symposium. IEEE Computer Society, 1133--1142","author":"Berney Kyle","year":"2020","unstructured":"Kyle Berney and Nodari Sitchinava. 2020. Engineering Worst-Case Inputs for Pairwise Merge Sort on GPUs. In International Parallel and Distributed Processing Symposium. IEEE Computer Society, 1133--1142."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Kyle Berney and Nodari Sitchinava. 2025. CF-Merge: Bank Conflict Free GPU Mergesort. https:\/\/github.com\/algoparc\/GPU-CFMerge\/.","DOI":"10.1145\/3694906.3743337"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2015.2485994"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2555243.2555253"},{"key":"e_1_3_2_1_12_1","volume-title":"Leiserson","author":"Cormen Thomas H.","year":"2022","unstructured":"Thomas H. Cormen, Clifford Stein, Ronald L. Rivest, and Charles E. Leiserson. 2022. Introduction to Algorithms (4nd ed.). McGraw-Hill Higher Education."},{"key":"e_1_3_2_1_13_1","volume-title":"Israel Symposium on Theory of Computing and Systems. IEEE Computer Society, 11--19","author":"Czumaj Artur","unstructured":"Artur Czumaj, Friedhelm Meyer auf der Heide, and Volker Stemann. 1995. Improved Optimal Shared Memory Simulations, and the Power of Reconfiguration. In Israel Symposium on Theory of Computing and Systems. IEEE Computer Society, 11--19."},{"key":"e_1_3_2_1_14_1","volume-title":"Shared Memory Simulations with Triple-Logarithmic Delay. In European Symposium on Algorithms (Lecture Notes in Computer Science","volume":"59","author":"Czumaj Artur","unstructured":"Artur Czumaj, Friedhelm Meyer auf der Heide, and Volker Stemann. 1995. Shared Memory Simulations with Triple-Logarithmic Delay. In European Symposium on Algorithms (Lecture Notes in Computer Science, Vol. 979). Springer, 46--59."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1137\/S009753979529564X"},{"key":"e_1_3_2_1_16_1","volume-title":"Work-Efficient Parallel GPU Methods for Single-Source Shortest Paths. In International Parallel and Distributed Processing Symposium. IEEE Computer Society, 349--359","author":"Davidson Andrew A.","unstructured":"Andrew A. Davidson, Sean Baxter, Michael Garland, and John D. Owens. 2014. Work-Efficient Parallel GPU Methods for Single-Source Shortest Paths. In International Parallel and Distributed Processing Symposium. IEEE Computer Society, 349--359."},{"key":"e_1_3_2_1_17_1","volume-title":"Efficient Shared Memory Simulations. In Symposium on Parallel Algorithms and Architectures. ACM, 110--119","author":"Dietzfelbinger Martin","unstructured":"Martin Dietzfelbinger and Friedhelm Meyer auf der Heide. 1993. Simple, Efficient Shared Memory Simulations. In Symposium on Parallel Algorithms and Architectures. ACM, 110--119."},{"key":"e_1_3_2_1_18_1","volume-title":"Fast Scan Algorithms on Graphics Processors. In International Conference on Supercomputing. ACM, 205--213","author":"Dotsenko Yuri","year":"2008","unstructured":"Yuri Dotsenko, Naga K. Govindaraju, Peter-Pike J. Sloan, Charles Boyd, and John Manferdelli. 2008. Fast Scan Algorithms on Graphics Processors. In International Conference on Supercomputing. ACM, 205--213."},{"key":"e_1_3_2_1_19_1","volume-title":"Elster","author":"Falch Thomas L.","year":"2014","unstructured":"Thomas L. Falch and Anne C. Elster. 2014. Register Caching for Stencil Computations on GPUs. In Symbolic and Numeric Algorithms for Scientific Computing. IEEE, 479--486."},{"key":"e_1_3_2_1_20_1","volume-title":"Throughput-oriented GPU Memory Allocation. In Symposium on Principles and Practice of Parallel Programming. ACM, 27--37","author":"Gelado Isaac","year":"2019","unstructured":"Isaac Gelado and Michael Garland. 2019. Throughput-oriented GPU Memory Allocation. In Symposium on Principles and Practice of Parallel Programming. ACM, 27--37."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2010.61"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1983.1676201"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-012-0201-1"},{"key":"e_1_3_2_1_24_1","volume-title":"GPU Merge Path: A GPU Merging Algorithm. In International Conference on Supercomputing. ACM, 331--340","author":"Green Oded","unstructured":"Oded Green, Robert McColl, and David A. Bader. 2012. GPU Merge Path: A GPU Merging Algorithm. In International Conference on Supercomputing. ACM, 331--340."},{"key":"e_1_3_2_1_25_1","volume-title":"Technical Report AD-759 248","author":"Habermann A. Nico","unstructured":"A. Nico Habermann. 1972. Parallel Neighbor-Sort (or the Glory of the Induction Principle). Technical Report AD-759 248. Carnegie Mellon University."},{"key":"e_1_3_2_1_26_1","volume-title":"An Introduction to Parallel Algorithms","author":"J\u00e1J\u00e1 Joseph F.","unstructured":"Joseph F. J\u00e1J\u00e1. 1992. An Introduction to Parallel Algorithms. Addison-Wesley."},{"key":"e_1_3_2_1_27_1","volume-title":"Symposium on Theory of Computing. ACM, 318--326","author":"Karp Richard","unstructured":"Richard M.Karp, Michael Luby, and Friedhelm Meyer auf der Heide. 1992. Efficient PRAM Simulation on a Distributed Memory Machine. In Symposium on Theory of Computing. ACM, 318--326."},{"key":"e_1_3_2_1_28_1","volume-title":"High Performance Computing","author":"Karsin Ben","unstructured":"Ben Karsin, Henri Casanova, and Nodari Sitchinava. 2015. Efficient Batched Predecessor Search in Shared Memory on GPUs. In High Performance Computing. IEEE Computer Society, 335--344."},{"key":"e_1_3_2_1_29_1","volume-title":"Analysis-driven Engineering of Comparison-based Sorting Algorithms on GPUs. In International Conference on Supercomputing. ACM, 86--95","author":"Karsin Ben","year":"2018","unstructured":"Ben Karsin, Volker Weichert, Henri Casanova, John Iacono, and Nodari Sitchinava. 2018. Analysis-driven Engineering of Comparison-based Sorting Algorithms on GPUs. In International Conference on Supercomputing. ACM, 86--95."},{"key":"e_1_3_2_1_30_1","volume-title":"An Implementation of Conflict-Free Offline Permutation on the GPU. In International Conference on Networking and Computing. IEEE Computer Society, 226--232","author":"Kasagi Akihiko","year":"2012","unstructured":"Akihiko Kasagi, Koji Nakano, and Yasuaki Ito. 2012. An Implementation of Conflict-Free Offline Permutation on the GPU. In International Conference on Networking and Computing. IEEE Computer Society, 226--232."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/2122.2124"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129626411000187"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/0304-3975(96)00032-1"},{"key":"e_1_3_2_1_34_1","volume-title":"Simple Memory Machine Models for GPUs. In International Parallel and Distributed Processing Symposium, Workshops and PhD Forum. IEEE Computer Society, 794--803","author":"Nakano Koji","year":"2012","unstructured":"Koji Nakano. 2012. Simple Memory Machine Models for GPUs. In International Parallel and Distributed Processing Symposium, Workshops and PhD Forum. IEEE Computer Society, 794--803."},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2022. CUDA C Best Practices Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-best-practices-guide\/index.html."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA. 2022. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html."},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA. 2022. CUDA Toolkit Documentation v11. https:\/\/docs.nvidia.com\/cuda\/index.html."},{"key":"e_1_3_2_1_38_1","volume-title":"Thrust: The C++ Parallel Algorithms Library. https:\/\/github.com\/thrust\/thrust.","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. 2022. Thrust: The C++ Parallel Algorithms Library. https:\/\/github.com\/thrust\/thrust."},{"key":"e_1_3_2_1_39_1","volume-title":"Designing Efficient Sorting Algorithms for Manycore GPUs. In International Symposium on Parallel and Distributed Processing. IEEE, 1--10","author":"Satish Nadathur","year":"2009","unstructured":"Nadathur Satish, Mark J. Harris, and Michael Garland. 2009. Designing Efficient Sorting Algorithms for Manycore GPUs. In International Symposium on Parallel and Distributed Processing. IEEE, 1--10."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/7531.7926"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.79"},{"key":"e_1_3_2_1_42_1","volume-title":"International Symposium on Parallel and Distributed Processing. IEEE Computer Society, 1--12","author":"Xiao Shucai","year":"2010","unstructured":"Shucai Xiao and Wu-chun Feng. 2010. Inter-block GPU Communication via Fast Barrier Synchronization. In International Symposium on Parallel and Distributed Processing. IEEE Computer Society, 1--12."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Yunlong Xu Lan Gao Rui Wang Zhongzhi Luan Weiguo Wu and Depei Qian. 2016. Lock-based Synchronization for GPU Architectures. In Computing Frontiers. ACM 205--213.","DOI":"10.1145\/2903150.2903155"},{"key":"e_1_3_2_1_44_1","volume-title":"Fast Tridiagonal Solvers on the GPU. In Symposium on Principles and Practice of Parallel Programming, R. Govindarajan, David A. Padua, and Mary W. Hall (Eds.). ACM, 127--136","author":"Zhang Yao","unstructured":"Yao Zhang, Jonathan Cohen, and John D. Owens. 2010. Fast Tridiagonal Solvers on the GPU. In Symposium on Principles and Practice of Parallel Programming, R. Govindarajan, David A. Padua, and Mary W. Hall (Eds.). ACM, 127--136."}],"event":{"name":"SPAA '25: 37th ACM Symposium on Parallelism in Algorithms and Architectures","location":"Portland OR USA","acronym":"SPAA '25","sponsor":["SIGACT ACM Special Interest Group on Algorithms and Computation Theory","SIGARCH ACM Special Interest Group on Computer Architecture","EATCS European Association for Theoretical Computer Science"]},"container-title":["Proceedings of the 37th ACM Symposium on Parallelism in Algorithms and Architectures"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694906.3743337","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T19:18:29Z","timestamp":1777922309000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694906.3743337"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,16]]},"references-count":44,"alternative-id":["10.1145\/3694906.3743337","10.1145\/3694906"],"URL":"https:\/\/doi.org\/10.1145\/3694906.3743337","relation":{},"subject":[],"published":{"date-parts":[[2025,7,16]]},"assertion":[{"value":"2025-07-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}