{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T02:19:06Z","timestamp":1773886746790,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,9]],"date-time":"2024-06-09T00:00:00Z","timestamp":1717891200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangdong Provincial Key Laboratory","award":["2020B121201001"],"award-info":[{"award-number":["2020B121201001"]}]},{"DOI":"10.13039\/501100006374","name":"Shenzhen Fundamental Research Program","doi-asserted-by":"publisher","award":["20220815112848002"],"award-info":[{"award-number":["20220815112848002"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,10]]},"DOI":"10.1145\/3662010.3663445","type":"proceedings-article","created":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T10:30:07Z","timestamp":1717065007000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["How Does Software Prefetching Work on GPU Query Processing?"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9487-1455","authenticated-orcid":false,"given":"Yangshen","family":"Deng","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, AlayaDB AI, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5988-4233","authenticated-orcid":false,"given":"Shiwen","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, AlayaDB AI, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1798-8770","authenticated-orcid":false,"given":"Zhaoyang","family":"Hong","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, AlayaDB AI, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8424-0092","authenticated-orcid":false,"given":"Bo","family":"Tang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Southern University of Science and Technology, AlayaDB AI, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,6,9]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2020. Controlling Data Movement to Boost Performance on the NVIDIA Ampere Architecture. https:\/\/developer.nvidia.com\/blog\/controlling-data-movement-to-boost-performance-on-ampere-architecture"},{"key":"e_1_3_2_1_2_1","unstructured":"2022. Boosting Application Performance with GPU Memory Prefetching. https:\/\/developer.nvidia.com\/blog\/boosting-application-performance-with-gpu-memory-prefetching"},{"key":"e_1_3_2_1_3_1","unstructured":"2024. Single instruction multiple threads. https:\/\/en.wikipedia.org\/wiki\/Single_instruction _multiple_threads"},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Source code. https:\/\/github.com\/DBGroup-SUSTech\/GPUDB-Prefetch"},{"key":"e_1_3_2_1_5_1","volume-title":"Owens","author":"Awad Muhammad A.","year":"2019","unstructured":"Muhammad A. Awad, Saman Ashkiani, Rob Johnson, Mart\u00edn Farach-Colton, and John D. Owens. 2019. Engineering a high-performance GPU B-Tree. In PPoPP. 145--157."},{"key":"e_1_3_2_1_6_1","volume-title":"Owens","author":"Awad Muhammad A.","year":"2023","unstructured":"Muhammad A. Awad, Serban D. Porumbescu, and John D. Owens. 2023. A GPU Multiversion B-Tree. In PACT. 481--493."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.14778\/2732219.2732227"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Nils Boeschen and Carsten Binnig. 2022. GaccO - A GPU-accelerated OLTP DBMS. In SIGMOD. 1003--1016.","DOI":"10.1145\/3514221.3517876"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.14778\/3632093.3632107"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"S. Chen A. Ailamaki P.B. Gibbons and T.C. Mowry. 2004. Improving hash join performance through prefetching. In ICDE. 116--127.","DOI":"10.1109\/ICDE.2004.1319989"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/376284.375688"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.14778\/3303753.3303760"},{"key":"e_1_3_2_1_13_1","volume-title":"Hardware-conscious Query Processing in GPU-accelerated Analytical Engines. In Conference on Innovative Data Systems Research.","author":"Chrysogelos Periklis","year":"2019","unstructured":"Periklis Chrysogelos, Panagiotis Sioulas, and Anastasia Ailamaki. 2019. Hardware-conscious Query Processing in GPU-accelerated Analytical Engines. In Conference on Innovative Data Systems Research."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.14778\/3659437.3659443"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.14778\/3368289.3368290"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Henning Funke Sebastian Bre\u00df Stefan Noll Volker Markl and Jens Teubner. 2018. Pipelined Query Processing in Coprocessor Environments. In SIGMOD. 1603--1618.","DOI":"10.1145\/3183713.3183734"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.14778\/3380750.3380758"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.14778\/1952376.1952381"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/3430915.3430932"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.14778\/3632093.3632117"},{"key":"e_1_3_2_1_21_1","volume-title":"Das","author":"Jog Adwait","year":"2013","unstructured":"Adwait Jog, Onur Kayiran, Asit K. Mishra, Mahmut T. Kandemir, Onur Mutlu, Ravishankar Iyer, and Chita R. Das. 2013. Orchestrated scheduling and prefetching for GPGPUs. In ISCA. 332--343."},{"key":"e_1_3_2_1_22_1","volume-title":"Optimizing GPU-accelerated Group-By and Aggregation. ADMS@ VLDB 8","author":"Karnagel Tomas","year":"2015","unstructured":"Tomas Karnagel, Ren\u00e9 M\u00fcller, and Guy M Lohman. 2015. Optimizing GPU-accelerated Group-By and Aggregation. ADMS@ VLDB 8 (2015), 20."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.14778\/1687553.1687564"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.14778\/2856318.2856321"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00024"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Jaekyu Lee Nagesh B. Lakshminarayana Hyesoon Kim and Richard Vuduc. 2010. Many-Thread Aware Prefetching Mechanisms for GPGPU Applications. In MICRO. 213--224.","DOI":"10.1109\/MICRO.2010.44"},{"key":"e_1_3_2_1_27_1","first-page":"73","article-title":"Optimistic Lock Coupling: A Scalable and Efficient General-Purpose Synchronization Method","volume":"42","author":"Leis Viktor","year":"2019","unstructured":"Viktor Leis, Michael Haubenschild, and Thomas Neumann. 2019. Optimistic Lock Coupling: A Scalable and Efficient General-Purpose Synchronization Method. IEEE Data Eng. Bull. 42 (2019), 73--84.","journal-title":"IEEE Data Eng. Bull."},{"key":"e_1_3_2_1_28_1","volume-title":"Huan Li, Mingji Han, Qian Li, and Zhenghai Luo.","author":"Liu Haotian","year":"2022","unstructured":"Haotian Liu, Bo Tang, Jiashu Zhang, Yangshen Deng, Xiao Yan, Xinying Zheng, Qiaomu Shen, Dan Zeng, Zunyao Mao, Chaozu Zhang, Zhengxin You, Zhihao Wang, Runzhe Jiang, Fang Wang, Man Lung Yiu, Huan Li, Mingji Han, Qian Li, and Zhenghai Luo. 2022. GHive: accelerating analytical query processing in apache hive via CPU-GPU heterogeneous computing. In SoCC. 158--172."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Tobias Maltenberger Ivan Ilic Ilin Tolovski and Tilmann Rabl. 2020. Evaluating Multi-GPU Sorting with Modern Interconnects. In SIGMOD. 1795--1809.","DOI":"10.1145\/3514221.3517842"},{"key":"e_1_3_2_1_30_1","volume-title":"Snake: A Variable-length Chain-based Prefetching for GPUs. In MICRO. 728--741.","author":"Mostofi Saba","year":"2023","unstructured":"Saba Mostofi, Hajar Falahati, Negin Mahani, Pejman Lotfi-Kamran, and Hamid Sarbazi-Azad. 2023. Snake: A Variable-length Chain-based Prefetching for GPUs. In MICRO. 728--741."},{"key":"e_1_3_2_1_31_1","unstructured":"Todd Carl Mowry. 1995. Tolerating latency through software-controlled data prefetching. Ph. D. Dissertation."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Jan M\u00fchlig and Jens Teubner. 2021. MxTasks: How to Make Efficient Synchronization and Prefetching Easy. In SIGMOD. 1331--1344.","DOI":"10.1145\/3448016.3457268"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Johns Paul Bingsheng He Shengliang Lu and Chiew Tong Lau. 2019. Revisiting Hash Join on Graphics Processors: A Decade Later. In ICDEW. 294--299.","DOI":"10.1109\/ICDEW.2019.00008"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.14778\/3425879.3425890"},{"key":"e_1_3_2_1_35_1","volume-title":"Hamid Sarbazi-Azad, Mario Drumond, Babak Falsafi, Rachata Ausavarungnirun, and Onur Mutlu.","author":"Sadrosadati Mohammad","year":"2018","unstructured":"Mohammad Sadrosadati, Amirhossein Mirhosseini, Seyed Borna Ehsani, Hamid Sarbazi-Azad, Mario Drumond, Babak Falsafi, Rachata Ausavarungnirun, and Onur Mutlu. 2018. LTRF: Enabling High-Capacity Register Files for GPUs via Hardware\/Software Cooperative Register Prefetching. In ASPLOS. 489--502."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161005"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Ankit Sethia Ganesh Dasika Mehrzad Samadi and Scott Mahlke. 2013. APOGEE: adaptive prefetching on GPUs for energy efficiency. In PACT. 73--82.","DOI":"10.1109\/PACT.2013.6618798"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Amirhesam Shahvarani and Hans-Arno Jacobsen. 2016. A Hybrid B+-tree as Solution for In-Memory Indexing on CPU-GPU Heterogeneous Computing Platforms. In SIGMOD. 1523--1538.","DOI":"10.1145\/2882903.2882918"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Anil Shanbhag Samuel Madden and Xiangyao Yu. 2020. A Study of the Fundamental Performance Characteristics of GPUs and CPUs for Database Analytics. In SIGMOD. 1617--1632.","DOI":"10.1145\/3318464.3380595"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Panagiotis Sioulas Periklis Chrysogelos Manos Karpathiotakis Raja Appuswamy and Anastasia Ailamaki. 2019. Hardware-Conscious Hash-Joins on GPUs. In ICDE. 698--709.","DOI":"10.1109\/ICDE.2019.00068"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Elias Stehle and Hans-Arno Jacobsen. 2017. A Memory Bandwidth-Efficient Hybrid Radix Sort on GPUs. In SIGMOD. 417--432.","DOI":"10.1145\/3035918.3064043"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588709"},{"key":"e_1_3_2_1_43_1","volume-title":"Andersen","author":"Wang Ziqi","year":"2018","unstructured":"Ziqi Wang, Andrew Pavlo, Hyeontaek Lim, Viktor Leis, Huanchen Zhang, Michael Kaminsky, and David G. Andersen. 2018. Building a Bw-Tree Takes More Than Just Buzz Words. In SIGMOD. 473--488."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_45_1","unstructured":"Bowen Wu Dimitrios Koutsoukos and Gustavo Alonso. 2023. Efficiently Processing Large Relational Joins on GPUs. arXiv:2312.00720 [cs.DB]"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Bobbi Yogatama Brandon Miller Yunsong Wang Graham Markall Jacob Hemstad Gregory Kimball and Xiangyao Yu. 2023. Accelerating User-Defined Aggregate Functions (UDAF) with Block-wide Execution and JIT Compilation on GPUs. In DaMoN. 19--26.","DOI":"10.1145\/3592980.3595307"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551809"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Weihua Zhang Chuanlei Zhao Lu Peng Yuzhe Lin Fengzhe Zhang and Yunping Lu. 2023. Boosting Performance and QoS for Concurrent GPU B+trees by Combining-Based Synchronization. In PPoPP. 1--13.","DOI":"10.1145\/3572848.3577474"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.14778\/3636218.3636240"}],"event":{"name":"SIGMOD\/PODS '24: International Conference on Management of Data","location":"Santiago AA Chile","acronym":"SIGMOD\/PODS '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 20th International Workshop on Data Management on New Hardware"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3662010.3663445","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3662010.3663445","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:04:48Z","timestamp":1755972288000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3662010.3663445"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,9]]},"references-count":49,"alternative-id":["10.1145\/3662010.3663445","10.1145\/3662010"],"URL":"https:\/\/doi.org\/10.1145\/3662010.3663445","relation":{},"subject":[],"published":{"date-parts":[[2024,6,9]]},"assertion":[{"value":"2024-06-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}