{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,21]],"date-time":"2026-07-21T23:11:16Z","timestamp":1784675476035,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656596","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"537-548","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["RadiK: Scalable and Optimized GPU-Parallel Radix Top-K Selection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5041-1284","authenticated-orcid":false,"given":"Yifei","family":"Li","sequence":"first","affiliation":[{"name":"Alibaba Group, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5450-5124","authenticated-orcid":false,"given":"Bole","family":"Zhou","sequence":"additional","affiliation":[{"name":"Independent, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3341-2943","authenticated-orcid":false,"given":"Jiejing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0996-2260","authenticated-orcid":false,"given":"Xuechao","family":"Wei","sequence":"additional","affiliation":[{"name":"Alibaba Group, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9618-5523","authenticated-orcid":false,"given":"Yinghan","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7358-819X","authenticated-orcid":false,"given":"Yingda","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek\u00a0G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A System for Large-Scale Machine Learning. In Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation (Savannah, GA, USA) (OSDI\u201916). USENIX Association, USA, 265\u2013283."},{"key":"e_1_3_2_1_2_1","unstructured":"Andy Adinets. 2014. CUDA Pro Tip: Optimized Filtering with Warp-Aggregated Atomics. NVIDIA. https:\/\/developer.nvidia.com\/blog\/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics\/"},{"issue":"2","key":"e_1_3_2_1_3_1","article-title":"Fast k-Selection Algorithms for Graphics Processing Units","volume":"4","author":"Alabi Tolu","year":"2012","unstructured":"Tolu Alabi, Jeffrey\u00a0D. Blanchard, Bradley Gordon, and Russel Steinbach. 2012. Fast k-Selection Algorithms for Graphics Processing Units. ACM J. Exp. Algorithmics 17, Article 4.2 (10 2012), 29\u00a0pages.","journal-title":"ACM J. Exp. Algorithmics 17, Article"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/0020-0190(80)90023-X"},{"key":"e_1_3_2_1_5_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arXiv:2309.16609."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1468075.1468121"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.14778\/2824032.2824050"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389729"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1082"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (St","author":"Gaihre Anil","year":"2021","unstructured":"Anil Gaihre, Da Zheng, Scott Weitze, Lingda Li, Shuaiwen\u00a0Leon Song, Caiwen Ding, Xiaoye\u00a0S. Li, and Hang Liu. 2021. Dr. Top-k: Delegate-Centric Top-k on GPUs. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (St. Louis, MO, USA) (SC \u201921). Association for Computing Machinery, New York, NY, USA, Article 39, 14\u00a0pages."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2008.125"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/1142473.1142511"},{"key":"e_1_3_2_1_13_1","unstructured":"Mark Harris. 2013. How to Access Global Memory Efficiently in CUDA C\/C++ Kernels. NVIDIA. https:\/\/developer.nvidia.com\/blog\/how-access-global-memory-efficiently-cuda-c-kernels\/"},{"key":"e_1_3_2_1_14_1","volume-title":"Computer Architecture: A Quantitative Approach","author":"Hennessy L.","year":"2017","unstructured":"John\u00a0L. Hennessy and David\u00a0A. Patterson. 2017. Computer Architecture: A Quantitative Approach (6th ed.). Morgan Kaufmann Publishers Inc., San Francisco, CA, USA.","edition":"6"},{"key":"e_1_3_2_1_15_1","volume-title":"8th International Conference on Learning Representations, ICLR","author":"Holtzman Ari","year":"2020","unstructured":"Ari Holtzman, Jan Buys, Li Du, Maxwell Forbes, and Yejin Choi. 2020. The Curious Case of Neural Text Degeneration. In 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020. OpenReview.net, Virtual Only Conference, 3679 \u2013 3694."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_17_1","volume-title":"Batched Sparse Iterative Solvers on GPU for the Collision Operator for Fusion Plasma Simulations. In 2022 IEEE International Parallel and Distributed Processing Symposium, IPDPS","author":"Kashi Aditya","year":"2022","unstructured":"Aditya Kashi, Pratik Nayak, Dhruva Kulkarni, Aaron Scheinberg, Paul Lin, and Hartwig Anzt. 2022. Batched Sparse Iterative Solvers on GPU for the Collision Operator for Fusion Plasma Simulations. In 2022 IEEE International Parallel and Distributed Processing Symposium, IPDPS 2022. IEEE, Lyon, France, 157\u2013167."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097)","author":"Kool Wouter","year":"2019","unstructured":"Wouter Kool, Herke van Hoof, and Max Welling. 2019. Stochastic Beams and Where to Find Them: The Gumbel-Top-k Trick for Sampling Sequences Without Replacement. In Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, Long Beach, CA, USA, 3499\u20133508."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0139)","author":"Kosaian Jack","year":"2021","unstructured":"Jack Kosaian, Amar Phanishayee, Matthai Philipose, Debadeepta Dey, and Rashmi Vinayak. 2021. Boosting the Throughput and Accelerator Utilization of Specialized CNN Inference Beyond Increasing Batch Size. In Proceedings of the 38th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, Virtual Event, 5731\u20135741."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638478"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129626411000187"},{"key":"e_1_3_2_1_23_1","unstructured":"NVIDIA. 2022. CUB. NVIDIA. https:\/\/docs.nvidia.com\/cuda\/cub\/index.html"},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA. 2024. CUDA C++ Programming Guide. NVIDIA. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html"},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA. 2024. CUDA Toolkit Documentation. NVIDIA. https:\/\/docs.nvidia.com\/cuda\/"},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2024. How to Count Tokens with tiktoken. OpenAI. https:\/\/github.com\/openai\/openai-cookbook\/blob\/555bbc0d34cb6f1f0702e7e36b4bcc0d6d4697d8\/examples\/How_to_count_tokens_with_tiktoken.ipynb"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems. Curran Associates Inc.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Proceedings of the 33rd International Conference on Neural Information Processing Systems. Curran Associates Inc., Red Hook, NY, USA, Article 721, 12\u00a0pages."},{"key":"e_1_3_2_1_28_1","volume-title":"Fast In-Place Sorting with CUDA Based on Bitonic Sort","author":"Peters Hagen","unstructured":"Hagen Peters, Ole Schulz-Hildebrandt, and Norbert Luttenberger. 2010. Fast In-Place Sorting with CUDA Based on Bitonic Sort. In Parallel Processing and Applied Mathematics, Roman Wyrzykowski, Jack Dongarra, Konrad Karczewski, and Jerzy Wasniewski (Eds.). Springer, Berlin, Heidelberg, 403\u2013410."},{"key":"e_1_3_2_1_29_1","unstructured":"Johan Philip. 2007. The Probability Distribution of the Distance between Two Random Points in a Box. KTH. https:\/\/people.kth.se\/\u00a0johanph\/habc.pdf"},{"key":"e_1_3_2_1_30_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/AQTR49680.2020.9129967"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897839.2927468"},{"key":"e_1_3_2_1_33_1","volume-title":"Designing Efficient Sorting Algorithms for Manycore GPUs. In 2009 IEEE International Symposium on Parallel and Distributed Processing. IEEE","author":"Satish Nadathur","year":"2009","unstructured":"Nadathur Satish, Mark Harris, and Michael Garland. 2009. Designing Efficient Sorting Algorithms for Manycore GPUs. In 2009 IEEE International Symposium on Parallel and Distributed Processing. IEEE, Rome, Italy, 1\u201310."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3183735"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3064043"},{"key":"e_1_3_2_1_36_1","unstructured":"Benjamin Thirey and Randal Hickman. 2015. Distribution of Euclidean Distances Between Randomly Distributed Gaussian Points in n-Space. arXiv:1508.02238."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Jeffrey\u00a0S. Vetter Ron Brightwell Maya Gokhale Pat McCormick Rob Ross John Shalf Katie Antypas David Donofrio Anshu Dubey Travis Humble Catherine Schuman Brian Van\u00a0Essen Shinjae Yoo Alex Aiken David Bernholdt Suren Byna Kirk Cameron Frank Cappello Barbara Chapman Andrew Chien Mary Hall Rebecca Hartman-Baker Zhiling Lan Michael Lang John Leidel Sherry Li Robert Lucas John Mellor-Crummey Paul Peltz\u00a0Jr. Thomas Peterka Michelle Strout and Jeremiah Wilke. 2018. Extreme Heterogeneity 2018: Productive Computational Science in the Era of Extreme Heterogeneity - Report for DOE ASCR Basic Research Needs Workshop on Extreme Heterogeneity. Technical Report. US Department of Energy Office of Science Advanced Scientific Computing Research United States.","DOI":"10.2172\/1494112"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202)","author":"Vilnis Luke","year":"2023","unstructured":"Luke Vilnis, Yury Zemlyanskiy, Patrick Murray, Alexandre\u00a0Tachard Passos, and Sumit Sanghai. 2023. Arithmetic Sampling: Parallel Diverse Decoding for Large Language Models. In Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, Honolulu, HI, USA, 35120\u201335136."},{"key":"e_1_3_2_1_39_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, USA, 521\u2013538."},{"key":"e_1_3_2_1_40_1","unstructured":"Christina Zhang and Yong Wang. 2020. Accelerating Top-K Computation on GPU. NVIDIA. https:\/\/live.nvidia.cn\/gtc-od\/attachments\/CNS20315.pdf"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607062"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656596","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656596","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:25:09Z","timestamp":1755876309000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656596"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":41,"alternative-id":["10.1145\/3650200.3656596","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656596","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}