{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T14:26:32Z","timestamp":1775744792378,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","license":[{"start":{"date-parts":[[2010,6,6]],"date-time":"2010-06-06T00:00:00Z","timestamp":1275782400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2010,6,6]]},"DOI":"10.1145\/1807167.1807207","type":"proceedings-article","created":{"date-parts":[[2010,6,8]],"date-time":"2010-06-08T12:37:34Z","timestamp":1276000654000},"page":"351-362","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":168,"title":["Fast sort on CPUs and GPUs"],"prefix":"10.1145","author":[{"given":"Nadathur","family":"Satish","sequence":"first","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]},{"given":"Changkyu","family":"Kim","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]},{"given":"Jatin","family":"Chhugani","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]},{"given":"Anthony D.","family":"Nguyen","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]},{"given":"Victor W.","family":"Lee","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]},{"given":"Daehyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]},{"given":"Pradeep","family":"Dubey","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2010,6,6]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"CUDPP\n  : CUDA Data Parallel Primitives Library. gpgpu.org\/developer\/cudpp\/.  CUDPP: CUDA Data Parallel Primitives Library. gpgpu.org\/developer\/cudpp\/."},{"key":"e_1_3_2_1_2_1","unstructured":"Intel Performance Primitives. http:\/\/software.intel.com\/en-us\/intel-ipp\/.  Intel Performance Primitives. http:\/\/software.intel.com\/en-us\/intel-ipp\/."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/212094.212131"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1468075.1468121"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1559845.1559877"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/91254"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/375663.375681"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2009.34"},{"issue":"2","key":"e_1_3_2_1_9_1","first-page":"1313","volume":"1","author":"Chhugani J.","year":"2008","unstructured":"J. Chhugani , A. D. Nguyen , V. W. Lee , Efficient implementation of sorting on multi-core SIMD CPU architectures. VLDB , 1 ( 2 ): 1313 -- 1324 , 2008 . J. Chhugani, A. D. Nguyen, V. W. Lee, et al. Efficient implementation of sorting on multi-core SIMD CPU architectures. VLDB, 1(2):1313--1324, 2008.","journal-title":"VLDB"},{"key":"e_1_3_2_1_10_1","volume-title":"to Algorithms","author":"Cormen T.","year":"1990","unstructured":"T. Cormen , C. Leiserson , and R. Rivest . Intro . to Algorithms . MIT Press , 1990 . T. Cormen, C. Leiserson, and R. Rivest. Intro. to Algorithms. MIT Press, 1990."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/646422.691940"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/1142473.1142511"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/1299042.1299047"},{"key":"e_1_3_2_1_14_1","unstructured":"Intel Advanced Vector Extensions Programming Reference. 2008 http:\/\/softwarecommunity.intel.com\/isn\/downloads\/intelavx\/Intel-AVXProgramming-Reference-31943302.pdf.  Intel Advanced Vector Extensions Programming Reference. 2008 http:\/\/softwarecommunity.intel.com\/isn\/downloads\/intelavx\/Intel-AVXProgramming-Reference-31943302.pdf."},{"key":"e_1_3_2_1_15_1","first-page":"101","volume":"0","author":"Jim\u00e9nex-Gonz\u00e1lez D.","year":"2003","unstructured":"D. Jim\u00e9nex-Gonz\u00e1lez , J. J. Navarro , and J.-L. Larriba-Pey . CC-Radix:a Cache Conscious Sorting Based on Radix sort. Euromicro Conference on Parallel , Distributed, and Network-Based Processing , 0 : 101 , 2003 . D. Jim\u00e9nex-Gonz\u00e1lez, J. J. Navarro, and J.-L. Larriba-Pey. CC-Radix:a Cache Conscious Sorting Based on Radix sort. Euromicro Conference on Parallel, Distributed, and Network-Based Processing, 0:101, 2003.","journal-title":"Network-Based Processing"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.14778\/1687553.1687564"},{"key":"e_1_3_2_1_17_1","author":"Lamarca A.","year":"1997","unstructured":"A. Lamarca and R. E. Ladner . The Influence of Caches on the Performance of Sorting. In Journal of Algorithms, pages 370--379 , 1997 . A. Lamarca and R. E. Ladner. The Influence of Caches on the Performance of Sorting. In Journal of Algorithms, pages 370--379, 1997.","journal-title":"In Journal of Algorithms, pages 370--379"},{"key":"e_1_3_2_1_18_1","volume-title":"Gpu sample sort","author":"Leischner N.","year":"2009","unstructured":"N. Leischner , V. Osipov , and P. Sanders . Gpu sample sort , 2009 . N. Leischner, V. Osipov, and P. Sanders. Gpu sample sort, 2009."},{"key":"e_1_3_2_1_19_1","volume-title":"Fermi Architecture White Paper","author":"NVIDIA.","year":"2009","unstructured":"NVIDIA. Fermi Architecture White Paper , 2009 . NVIDIA. Fermi Architecture White Paper, 2009."},{"key":"e_1_3_2_1_20_1","volume-title":"NVIDIA CUDA Programming Guide 2.3","author":"NVIDIA.","year":"2009","unstructured":"NVIDIA. NVIDIA CUDA Programming Guide 2.3 . 2009 . NVIDIA. NVIDIA CUDA Programming Guide 2.3. 2009."},{"key":"e_1_3_2_1_21_1","volume-title":"HPEC","author":"Reilly M.","year":"2008","unstructured":"M. Reilly . When multicore isn't enough: Trends and the future for multi-multicore systems . In HPEC , 2008 . M. Reilly. When multicore isn't enough: Trends and the future for multi-multicore systems. In HPEC, 2008."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161005"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/1399504.1360617"},{"key":"e_1_3_2_1_24_1","first-page":"97","volume":"2007","author":"Sengupta S.","year":"2007","unstructured":"S. Sengupta , M. Harris , Y. Zhang , and J. D. Owens . Scan Primitives for GPU Computing. In Graphics Hardware 2007 , pages 97 -- 106 , Aug. 2007 . S. Sengupta, M. Harris, Y. Zhang, and J. D. Owens. Scan Primitives for GPU Computing. In Graphics Hardware 2007, pages 97--106, Aug. 2007.","journal-title":"Scan Primitives for GPU Computing. In Graphics Hardware"},{"key":"e_1_3_2_1_25_1","volume-title":"Workshop on GPGPU","author":"Sintorn E.","year":"2007","unstructured":"E. Sintorn and U. Assarsson . Fast Parallel GPU-Sorting Using a Hybrid Algorithm . In Workshop on GPGPU , 2007 . E. Sintorn and U. Assarsson. Fast Parallel GPU-Sorting Using a Hybrid Algorithm. In Workshop on GPGPU, 2007."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/147877.147896"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/125826.126164"}],"event":{"name":"SIGMOD\/PODS '10: International Conference on Management of Data","location":"Indianapolis Indiana USA","acronym":"SIGMOD\/PODS '10","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2010 ACM SIGMOD International Conference on Management of data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1807167.1807207","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/1807167.1807207","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T12:17:35Z","timestamp":1750249055000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1807167.1807207"}},"subtitle":["a case for bandwidth oblivious SIMD sort"],"short-title":[],"issued":{"date-parts":[[2010,6,6]]},"references-count":27,"alternative-id":["10.1145\/1807167.1807207","10.1145\/1807167"],"URL":"https:\/\/doi.org\/10.1145\/1807167.1807207","relation":{},"subject":[],"published":{"date-parts":[[2010,6,6]]},"assertion":[{"value":"2010-06-06","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}