{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T04:16:46Z","timestamp":1777954606724,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,16]]},"DOI":"10.1145\/3694906.3743326","type":"proceedings-article","created":{"date-parts":[[2025,7,16]],"date-time":"2025-07-16T16:19:56Z","timestamp":1752682796000},"page":"255-268","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Decoupled Fallback: A Portable Single-Pass GPU Scan"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3612-865X","authenticated-orcid":false,"given":"Thomas","family":"Smith","sequence":"first","affiliation":[{"name":"Google, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4592-3389","authenticated-orcid":false,"given":"Raph","family":"Levien","sequence":"additional","affiliation":[{"name":"Google, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6582-8237","authenticated-orcid":false,"given":"John D.","family":"Owens","sequence":"additional","affiliation":[{"name":"University of California, Davis, USA and Google"}]}],"member":"320","published-online":{"date-parts":[[2025,7,16]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Onesweep: A Faster Least Significant Digit Radix Sort for GPUs. arXiv:2206.01784 [cs.DC] https:\/\/arxiv.org\/abs\/2206.01784","author":"Adinets Andy","year":"2022","unstructured":"Andy Adinets and Duane Merrill. 2022. Onesweep: A Faster Least Significant Digit Radix Sort for GPUs. arXiv:2206.01784 [cs.DC] https:\/\/arxiv.org\/abs\/2206.01784"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-37036-6_28"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1654059.1654078"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/12.42122"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-21878-1_46"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1982.1675982"},{"key":"e_1_3_2_1_7_1","volume-title":"Preston","author":"Clifford Alfred Hoblitzelle","year":"1961","unstructured":"Alfred Hoblitzelle Clifford and Gordon B. Preston. 1961. The Algebraic Theory of Semigroups, Vol. I. Number 7 in Mathematical Surveys. American Mathematical Society, Providence, RI."},{"key":"e_1_3_2_1_8_1","volume-title":"Intel\u00ae 64 and IA-32 Architectures Software Developer's Manual","author":"Intel Corporation","year":"2025","unstructured":"Intel Corporation. 2024. Intel\u00ae 64 and IA-32 Architectures Software Developer's Manual, Volume 3A: System Programming Guide, Part 1. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/intel-sdm.html Accessed: 2025-02-16."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375527.1375559"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/800061.808738"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2006.00969.x"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339596"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2012.336"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACSSC.2003.1292373"},{"key":"e_1_3_2_1_15_1","volume-title":"Owens","author":"Harris Mark","year":"2007","unstructured":"Mark Harris, Shubhabrata Sengupta, and John D. Owens. 2007. Parallel Prefix Sum (Scan) with CUDA. In GPU Gems 3, Hubert Nguyen (Ed.). Addison Wesley, Chapter 39, 851--876."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2005.00880.x"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-27764-4_11"},{"key":"e_1_3_2_1_18_1","volume-title":"GPU Gems 2","author":"Horn Daniel","unstructured":"Daniel Horn. 2005. Stream Reduction Operations for GPGPU Applications. In GPU Gems 2, Matt Pharr (Ed.). Addison Wesley, Chapter 36, 573--589."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1460833.1460872"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1147\/rd.312.0249"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1973.5009159"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/322217.322232"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575750"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_2_1_25_1","unstructured":"Duane Merrill and Michael Garland. 2016. Single-pass Parallel Prefix Scan with Decoupled Look-back. Technical Report NVR-2016-002. NVIDIA Corporation. https:\/\/research.nvidia.com\/publication\/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back"},{"key":"e_1_3_2_1_26_1","unstructured":"Duane Merrill and Andrew Grimshaw. 2009. Parallel Scan for Stream Architectures. Technical Report CS-2009-14. University of Virginia. https:\/\/doi.org\/10.18130\/V3XN3C Accessed: 2024-12-09."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1365490.1365500"},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA Corporation. 2024. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/ Accessed: 2024-12-06."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1201\/b10376-29"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.2312\/EGGH\/EGGH07\/097-106"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the Workshop on Edge Computing Using New Commodity Architectures. D-26-27","author":"Sengupta Shubhabrata","unstructured":"Shubhabrata Sengupta, Aaron E. Lefohn, and John D. Owens. 2006. A Work-Efficient Step-Efficient Prefix Sum Algorithm. In Proceedings of the Workshop on Edge Computing Using New Commodity Architectures. D-26-27. https:\/\/escholarship.org\/uc\/item\/6j57h5zw"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/0196-6774(86)90003-9"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3022671.2984032"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.4230\/LIPIcs.CONCUR.2018.23"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485508"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TEC.1956.5219801"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2442516.2442539"}],"event":{"name":"SPAA '25: 37th ACM Symposium on Parallelism in Algorithms and Architectures","location":"Portland OR USA","acronym":"SPAA '25","sponsor":["SIGACT ACM Special Interest Group on Algorithms and Computation Theory","SIGARCH ACM Special Interest Group on Computer Architecture","EATCS European Association for Theoretical Computer Science"]},"container-title":["Proceedings of the 37th ACM Symposium on Parallelism in Algorithms and Architectures"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694906.3743326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T19:18:51Z","timestamp":1777922331000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694906.3743326"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,16]]},"references-count":37,"alternative-id":["10.1145\/3694906.3743326","10.1145\/3694906"],"URL":"https:\/\/doi.org\/10.1145\/3694906.3743326","relation":{},"subject":[],"published":{"date-parts":[[2025,7,16]]},"assertion":[{"value":"2025-07-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}