{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T00:05:35Z","timestamp":1759104335695,"version":"3.44.0"},"publisher-location":"Cham","reference-count":18,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032063427","type":"print"},{"value":"9783032063434","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T00:00:00Z","timestamp":1759104000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T00:00:00Z","timestamp":1759104000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-06343-4_10","type":"book-chapter","created":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T13:23:55Z","timestamp":1759065835000},"page":"151-164","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating LLVM OpenMP Offload Optimizations on\u00a0NVIDIA GH200 Grace Hopper Superchip and\u00a0AMD Instinct\u2122\u00a0MI300A Accelerator Architectures"],"prefix":"10.1007","author":[{"given":"Kevin","family":"Sala","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stephen L.","family":"Olivier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rahulkumar","family":"Gayatri","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shilei","family":"Tian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Johannes","family":"Doerfert","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,29]]},"reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Bertolli, C., et al.: Performance analysis of runtime handling of zero-copy for OpenMP programs on MI300A APUs. In: SC24-W: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1420\u20131429. IEEE (2024)","DOI":"10.1109\/SCW63240.2024.00183"},{"key":"10_CR2","doi-asserted-by":"publisher","unstructured":"Doerfert, J., et al.: Co-designing an OpenMP GPU runtime and optimizations for near-zero overhead execution. In: 2022 IEEE International Parallel and Distributed Processing Symposium, IPDPS 2022, Lyon, France, May 30 - June 3, 2022, pp. 504\u2013514. IEEE (2022). https:\/\/doi.org\/10.1109\/IPDPS53621.2022.00055","DOI":"10.1109\/IPDPS53621.2022.00055"},{"issue":"2","key":"10_CR3","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1109\/MCSE.2022.3163817","volume":"24","author":"AC Elster","year":"2022","unstructured":"Elster, A.C., Haugdahl, T.A.: Nvidia Hopper GPU and Grace CPU highlights. Comput. Sci. Eng. 24(2), 95\u2013100 (2022). https:\/\/doi.org\/10.1109\/MCSE.2022.3163817","journal-title":"Comput. Sci. Eng."},{"key":"10_CR4","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1007\/978-3-031-40744-4_14","volume-title":"OpenMP: Advanced Task-Based, Device and Compiler Programming","author":"W Elwasif","year":"2023","unstructured":"Elwasif, W.: Experimental characterization of OpenMP offloading memory operations and unified shared memory support. In: McIntosh-Smith, S., Klemm, M., de Supinski, B.R., Deakin, T., Klinkenberg, J. (eds.) OpenMP: Advanced Task-Based, Device and Compiler Programming. LNCS, pp. 210\u2013225. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-40744-4_14"},{"key":"10_CR5","doi-asserted-by":"publisher","unstructured":"Evans, J.: Nvidia Grace. In: 2022 IEEE Hot Chips 34 Symposium (HCS), pp. 1\u201320. IEEE Computer Society, Los Alamitos (2022). https:\/\/doi.org\/10.1109\/HCS55958.2022.9895599","DOI":"10.1109\/HCS55958.2022.9895599"},{"key":"10_CR6","unstructured":"Gayatri, R., et al.: Rapid exploration of optimization strategies on advanced architectures using TestSNAP and LAMMPS. arXiv preprint arXiv:2011.12875 (2020)"},{"key":"10_CR7","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1007\/978-3-031-40744-4_7","volume-title":"OpenMP: Advanced Task-Based, Device and Compiler Programming","author":"R Gayatri","year":"2023","unstructured":"Gayatri, R., Olivier, S.L., Trott, C.R., Doerfert, J., Ciesko, J., Lebrun-Grandie, D.: The Kokkos OpenMPTarget backend: implementation and lessons learned. In: McIntosh-Smith, S., Klemm, M., de Supinski, B.R., Deakin, T., Klinkenberg, J. (eds.) OpenMP: Advanced Task-Based, Device and Compiler Programming, pp. 99\u2013113. Springer Nature Switzerland, Cham (2023)"},{"key":"10_CR8","doi-asserted-by":"publisher","unstructured":"Gayatri, R., Tian, S., Olivier, S.L., Wright, E., Doerfert, J.: Leveraging LLVM OpenMP GPU offload optimizations for Kokkos applications. In: 2024 IEEE 31st International Conference on High Performance Computing, Data, and Analytics (HiPC), pp. 277\u2013287 (2024). https:\/\/doi.org\/10.1109\/HiPC62374.2024.00035","DOI":"10.1109\/HiPC62374.2024.00035"},{"key":"10_CR9","doi-asserted-by":"publisher","unstructured":"Jin, Z.: Sum reduction with OpenMP offload on NVIDIA Grace-Hopper system. In: SC24-W: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1006\u20131013 (2024). https:\/\/doi.org\/10.1109\/SCW63240.2024.00140","DOI":"10.1109\/SCW63240.2024.00140"},{"key":"10_CR10","doi-asserted-by":"publisher","unstructured":"Li, J., Wang, Y., Liang, X., Liu, H.: Automatic BLAS offloading on unified memory architecture: a study on NVIDIA Grace-Hopper. In: Practice and Experience in Advanced Research Computing 2024: Human Powered Computing. PEARC \u201924, Association for Computing Machinery, New York, NY, USA (2024). https:\/\/doi.org\/10.1145\/3626203.3670561","DOI":"10.1145\/3626203.3670561"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Ruzicka, J., Asch, C., Meneses, E., Rampp, M., Laure, E.: A study of performance portability in plasma physics simulations. In: Latin American High Performance Computing Conference, pp. 19\u201335. Springer (2024)","DOI":"10.1007\/978-3-031-80084-9_2"},{"key":"10_CR12","doi-asserted-by":"crossref","unstructured":"Sfiligoi, I.: Comparing CPU and GPU compute of PERMANOVA on MI300A (2025). https:\/\/arxiv.org\/abs\/2505.04556","DOI":"10.1145\/3708035.3736040"},{"key":"10_CR13","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"126","DOI":"10.1007\/978-3-031-72567-8_9","volume-title":"Advancing OpenMP for Future Accelerators","author":"B Shan","year":"2024","unstructured":"Shan, B., Araya-Polo, M., Chapman, B.: Evaluation of directive-based programming models for stencil computation on current GPGPU architectures. In: Espinosa, A., Klemm, M., de Supinski, B.R., Cytowski, M., Klinkenberg, J. (eds.) Advancing OpenMP for Future Accelerators. LNCS, pp. 126\u2013140. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-72567-8_9"},{"key":"10_CR14","doi-asserted-by":"publisher","unstructured":"Smith, A., et al.: AMD InstinctTM MI300 series modular chiplet package \u2013 HPC and AI accelerator for exa-class systems. In: 2024 IEEE International Solid-State Circuits Conference (ISSCC), vol.\u00a067, pp. 490\u2013492 (2024). https:\/\/doi.org\/10.1109\/ISSCC49657.2024.10454441","DOI":"10.1109\/ISSCC49657.2024.10454441"},{"key":"10_CR15","doi-asserted-by":"publisher","unstructured":"Smith, A., et al.: Realizing the AMD exascale heterogeneous processor vision : industry product. In: 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pp. 876\u2013889 (2024). https:\/\/doi.org\/10.1109\/ISCA59077.2024.00068","DOI":"10.1109\/ISCA59077.2024.00068"},{"key":"10_CR16","doi-asserted-by":"publisher","unstructured":"Tandon, S., et al.: Porting HPC applications to AMD Instinct\u2122 MI300A using unified memory and OpenMP\u00ae. In: ISC High Performance 2024 Research Paper Proceedings (39th International Conference), pp.\u00a01\u20139 (2024). https:\/\/doi.org\/10.23919\/ISC.2024.10528925","DOI":"10.23919\/ISC.2024.10528925"},{"key":"10_CR17","doi-asserted-by":"publisher","unstructured":"Tian, S., Scogland, T., Chapman, B., Doerfert, J.: OpenMP kernel language extensions for performance portable GPU codes. In: Proceedings of the SC \u201923 Workshops of The International Conference on High Performance Computing, Network, Storage, and Analysis, pp. 876\u2013883. SC-W \u201923, Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3624062.3624164","DOI":"10.1145\/3624062.3624164"},{"issue":"4","key":"10_CR18","doi-asserted-by":"publisher","first-page":"805","DOI":"10.1109\/TPDS.2021.3097283","volume":"33","author":"CR Trott","year":"2022","unstructured":"Trott, C.R., et al.: Kokkos 3: Programming model extensions for the exascale era. IEEE Trans. Parallel Distrib. Syst. 33(4), 805\u2013817 (2022). https:\/\/doi.org\/10.1109\/TPDS.2021.3097283","journal-title":"IEEE Trans. Parallel Distrib. Syst."}],"container-title":["Lecture Notes in Computer Science","OpenMP: Balancing Productivity and Performance Portability"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-06343-4_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T13:23:57Z","timestamp":1759065837000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-06343-4_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,29]]},"ISBN":["9783032063427","9783032063434"],"references-count":18,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-06343-4_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,29]]},"assertion":[{"value":"29 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"IWOMP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on OpenMP","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Charlotte, NC","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iwomp2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iwomp.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}