{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T15:28:49Z","timestamp":1759073329188,"version":"3.40.3"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031407437"},{"type":"electronic","value":"9783031407444"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-40744-4_8","type":"book-chapter","created":{"date-parts":[[2023,8,30]],"date-time":"2023-08-30T23:03:28Z","timestamp":1693436608000},"page":"114-128","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["OpenMP Target Offload Utilizing GPU Shared Memory"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9087-4646","authenticated-orcid":false,"given":"Mathias","family":"Gammelmark","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6620-6800","authenticated-orcid":false,"given":"Anton","family":"Rydahl","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0737-9992","authenticated-orcid":false,"given":"Sven","family":"Karlsson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,9,1]]},"reference":[{"key":"8_CR1","unstructured":"Adinets, A., Merrill, D.: Onesweep: a faster least significant digit radix sort for gpus. arXiv preprint arXiv:2206.01784 (2022). https:\/\/doi.org\/10.48550\/arXiv.2206.01784"},{"key":"8_CR2","unstructured":"Blelloch, G.E.: Prefix sums and their applications. Tech. Rep. CMU-CS-90-190, School of Computer Science, Carnegie Mellon University (1990)"},{"key":"8_CR3","unstructured":"Burnus, T.: Offloading support in GCC (2023). https:\/\/gcc.gnu.org\/wiki\/Offloading. Accessed 17 May 2023"},{"key":"8_CR4","unstructured":"Center for Science: LUMI-G documentation, GPU nodes. https:\/\/docs.lumi-supercomputer.eu\/hardware\/lumig\/ (2023). Accessed 15 May 2023"},{"key":"8_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/978-3-030-85262-7_5","volume-title":"OpenMP: Enabling Massive Node-Level Parallelism","author":"B Chapman","year":"2021","unstructured":"Chapman, B., et al.: Outcomes of openMP hackathon: openMP application experiences with the offloading model (part I). In: McIntosh-Smith, S., de Supinski, B.R., Klinkenberg, J. (eds.) IWOMP 2021. LNCS, vol. 12870, pp. 67\u201380. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-85262-7_5"},{"key":"8_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1007\/978-3-030-85262-7_6","volume-title":"OpenMP: Enabling Massive Node-Level Parallelism","author":"B Chapman","year":"2021","unstructured":"Chapman, B., et al.: Outcomes of openMP hackathon: openMP application experiences with the offloading model (part II). In: McIntosh-Smith, S., de Supinski, B.R., Klinkenberg, J. (eds.) IWOMP 2021. LNCS, vol. 12870, pp. 81\u201395. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-85262-7_6"},{"key":"8_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1007\/978-3-030-58144-2_3","volume-title":"OpenMP: Portable Multi-Level Parallelism on Modern Systems","author":"C Daley","year":"2020","unstructured":"Daley, C., Ahmed, H., Williams, S., Wright, N.: A case study of porting HPGMG from CUDA to openMP target offload. In: Milfeld, K., de Supinski, B.R., Koesterke, L., Klinkenberg, J. (eds.) IWOMP 2020. LNCS, vol. 12295, pp. 37\u201351. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58144-2_3"},{"key":"8_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1007\/978-3-030-74224-9_2","volume-title":"Accelerator Programming Using Directives","author":"JH Davis","year":"2021","unstructured":"Davis, J.H., Daley, C., Pophale, S., Huber, T., Chandrasekaran, S., Wright, N.J.: Performance assessment of OpenMP compilers targeting NVIDIA V100 GPUs. In: Bhalachandra, S., Wienke, S., Chandrasekaran, S., Juckeland, G. (eds.) WACCPD 2020. LNCS, vol. 12655, pp. 25\u201344. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-74224-9_2"},{"key":"8_CR9","unstructured":"DTU Computing Center: DTU Computing Center resources (2022). https:\/\/doi.org\/10.48714\/DTU.HPC.0001"},{"issue":"8","key":"8_CR10","doi-asserted-by":"publisher","first-page":"2368","DOI":"10.1111\/j.1467-8659.2009.01542.x","volume":"28","author":"L Ha","year":"2009","unstructured":"Ha, L., Kr\u00fcger, J., Silva, C.T.: Fast four-way parallel radix sorting on GPUs. Comput. Graph. Forum 28(8), 2368\u20132378 (2009). https:\/\/doi.org\/10.1111\/j.1467-8659.2009.01542.x","journal-title":"Comput. Graph. Forum"},{"key":"8_CR11","unstructured":"Harris, M., Sengupta, S., Owens, J.D.: Parallel prefix sum (scan) with CUDA. In: GPU Gems 3, pp. 851\u2013876. Addison-Wesley Professional (2007)"},{"key":"8_CR12","unstructured":"LLVM: Clang: a c language family frontend for LLVM (2023). https:\/\/clang.llvm.org\/. Accessed 26 May 2023"},{"key":"8_CR13","unstructured":"LLVM: Support, getting involved, and FAQ (2023). https:\/\/openmp.llvm.org\/SupportAndFAQ.html. Accessed 17 May 2023"},{"key":"8_CR14","unstructured":"LUMI: Cray compilers (2023). https:\/\/docs.lumi-supercomputer.eu\/development\/compiling\/cce\/. Accessed 26 May 2023"},{"key":"8_CR15","unstructured":"Merrill, D., Garland, M.: Single-pass parallel prefix scan with decoupled look-back. Tech. Rep. NVR-2016-002, NVIDIA (2016)"},{"key":"8_CR16","unstructured":"NVIDIA: Nvidia a100 tensor core gpu architecture, unprecedented acceleration at every scale (2020). https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf. Accessed 15 May 2023"},{"key":"8_CR17","unstructured":"NVIDIA: CUDA toolkit documentation v11.5.0 (2023). https:\/\/docs.nvidia.com\/cuda\/archive\/11.5.0\/. Accessed 26 May 2023"},{"key":"8_CR18","unstructured":"NVIDIA: Nvidia HPC SDK documentation (2023). https:\/\/docs.nvidia.com\/hpc-sdk\/archive\/22.7\/. Accessed 26 May 2023"},{"key":"8_CR19","unstructured":"OpenMP Architecture Review Board: OpenMP (2023). https:\/\/www.openmp.org\/. Accessed 15 May 2023"},{"key":"8_CR20","unstructured":"OpenMP Architecture Review Board: Openmp application programming interface version 4.0 (2023). https:\/\/www.openmp.org\/wp-content\/uploads\/OpenMP4.0.0.pdf. Accessed 15 May 2023"},{"key":"8_CR21","unstructured":"OpenMP Architecture Review Board: OpenMP application programming interface version 5.0 (2023). https:\/\/www.openmp.org\/wp-content\/uploads\/OpenMP-API-Specification-5.0.pdf. Accessed 15 May 2023"},{"key":"8_CR22","doi-asserted-by":"publisher","unstructured":"Rydahl, A., Gammelmark, M., Karlsson, S.: Feasibility studies in multi-GPU target offloading. In: Klemm, M., de Supinski, B.R., Klinkenberg, J., Neth, B. (eds.) OpenMP in a Modern World: From Multi-device Support to Meta Programming. IWOMP 2022. Lecture Notes in Computer Science, vol. 13527, pp. 81\u201393. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-15922-0_6","DOI":"10.1007\/978-3-031-15922-0_6"},{"key":"8_CR23","doi-asserted-by":"publisher","unstructured":"Talaashrafi, D., Maza, M.M., Doerfert, J.: Towards automatic openMP-aware utilization of fast GPU memory. In: Klemm, M., de Supinski, B.R., Klinkenberg, J., Neth, B. (eds.) OpenMP in a Modern World: From Multi-device Support to Meta Programming. IWOMP 2022. Lecture Notes in Computer Science, vol. 13527, pp. 67\u201380. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-15922-0_5","DOI":"10.1007\/978-3-031-15922-0_5"},{"key":"8_CR24","unstructured":"The GCC team: Offloading support in GCC (2023). https:\/\/gcc.gnu.org\/. Accessed 26 May 2023"},{"key":"8_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1007\/978-3-030-85262-7_11","volume-title":"OpenMP: Enabling Massive Node-Level Parallelism","author":"S Tian","year":"2021","unstructured":"Tian, S., Chesterfield, J., Doerfert, J., Chapman, B.: Experience report: writing a portable GPU runtime with OpenMP 5.1. In: McIntosh-Smith, S., de Supinski, B.R., Klinkenberg, J. (eds.) IWOMP 2021. LNCS, vol. 12870, pp. 159\u2013169. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-85262-7_11"},{"key":"8_CR26","doi-asserted-by":"publisher","unstructured":"Wu, K., Truong, N., Yuksel, C., Hoetzlein, R.: Fast fluid simulations with sparse volumes on the GPU. Comput. Graph. Forum 37(2), 157\u2013167 (2018). https:\/\/doi.org\/10.1111\/cgf.13350","DOI":"10.1111\/cgf.13350"},{"key":"8_CR27","doi-asserted-by":"publisher","unstructured":"Zegarra, M., Pereira, M., Martorell, X., Araujo, G.: Automatic scan parallelization in openmp. In: 2017 International Symposium on Computer Architecture and High Performance Computing Workshops (SBAC-PADW), pp. 85\u201390. IEEE (2017). https:\/\/doi.org\/10.1109\/SBAC-PADW.2017.23","DOI":"10.1109\/SBAC-PADW.2017.23"},{"issue":"6","key":"8_CR28","doi-asserted-by":"publisher","first-page":"508","DOI":"10.1177\/10943420211022811","volume":"35","author":"W Zhang","year":"2021","unstructured":"Zhang, W., Myers, A., Gott, K., Almgren, A., Bell, J.: AmReX: block-structured adaptive mesh refinement for multiphysics applications. Int. J. High Perform. Computing Applications 35(6), 508\u2013526 (2021). https:\/\/doi.org\/10.1177\/10943420211022811","journal-title":"Int. J. High Perform. Computing Applications"}],"container-title":["Lecture Notes in Computer Science","OpenMP: Advanced Task-Based, Device and Compiler Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-40744-4_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,30]],"date-time":"2023-08-30T23:04:18Z","timestamp":1693436658000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-40744-4_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031407437","9783031407444"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-40744-4_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"1 September 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"IWOMP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on OpenMP","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bristol","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iwomp2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iwomp.org","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"20","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"15","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"75% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}