{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T20:23:45Z","timestamp":1764361425166,"version":"3.46.0"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031997501"},{"type":"electronic","value":"9783031997518"}],"license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-031-99751-8_6","type":"book-chapter","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T01:17:55Z","timestamp":1759281475000},"page":"117-138","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-GPU Code Generation for\u00a0Out-of-Core Problems"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0202-907X","authenticated-orcid":false,"given":"Patrick\u00a0van","family":"Beurden","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1031-7226","authenticated-orcid":false,"given":"Thomas","family":"Koopman","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8663-1043","authenticated-orcid":false,"given":"Sven-Bodo","family":"Scholz","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,1]]},"reference":[{"key":"6_CR1","doi-asserted-by":"publisher","unstructured":"Aaldering, J., Scholz, S.B., Gastel, B.v.: Type patterns: pattern matching on shape-carrying array types. In: Proceedings of the 35th Symposium on Implementation and Application of Functional Languages. IFL \u201923, Association for Computing Machinery, New York, NY, USA (2024). https:\/\/doi.org\/10.1145\/3652561.3652572","DOI":"10.1145\/3652561.3652572"},{"key":"6_CR2","doi-asserted-by":"publisher","unstructured":"Andreetta, C., et al.: FinPar: a Parallel Financial Benchmark. ACM Trans. Archit. Code Optim. 13(2) (2016). https:\/\/doi.org\/10.1145\/2898354","DOI":"10.1145\/2898354"},{"issue":"3","key":"6_CR3","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1016\/0898-1221(75)90035-8","volume":"1","author":"J Barkley Rosser","year":"1975","unstructured":"Barkley Rosser, J.: Nine-point difference solutions for Poisson\u2019s equation. Comput. Math. Appl. 1(3), 351\u2013360 (1975). https:\/\/doi.org\/10.1016\/0898-1221(75)90035-8","journal-title":"Comput. Math. Appl."},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Barnes, J., Hut, P.: A hierarchical O (N log N) force-calculation algorithm. nature 324(6096), pp.446\u2013449 (1986)","DOI":"10.1038\/324446a0"},{"key":"6_CR5","unstructured":"van Beurden, P., Koopman, T., Scholz, S.B.: Multi GPU Ifl2024 (2024). https:\/\/gitlab.sac-home.org\/sac-group\/artefact-ifl2024-multi-gpu , commit 123528eb3080a1e5f244efc43079f7047c8cee77"},{"key":"6_CR6","doi-asserted-by":"publisher","unstructured":"van Beurden, P., Scholz, S.B.: On generating out-of-core GPU code for multi-dimensional array operations. In: Proceedings of the 34th Symposium on Implementation and Application of Functional Languages. IFL \u201922, Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3587216.3587223","DOI":"10.1145\/3587216.3587223"},{"key":"6_CR7","doi-asserted-by":"publisher","unstructured":"Chakravarty, M.M., Keller, G., Lee, S., McDonell, T.L., Grover, V.: Accelerating haskell array codes with multicore GPUs. In: Proceedings of the Sixth Workshop on Declarative Aspects of Multicore Programming, pp. 3\u201314. DAMP \u201911, Association for Computing Machinery, New York, NY, USA (2011). https:\/\/doi.org\/10.1145\/1926354.1926358","DOI":"10.1145\/1926354.1926358"},{"key":"6_CR8","doi-asserted-by":"publisher","unstructured":"Chen, H., Kim, M., Razenshteyn, I., Rotaru, D., Song, Y., Wagh, S.: Maliciously secure matrix multiplication with applications to private deep learning. In: Moriai, S., Wang, H. (eds.) ASIACRYPT 2020. LNCS, vol. 12493, pp. 31\u201359. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-64840-4_2","DOI":"10.1007\/978-3-030-64840-4_2"},{"key":"6_CR9","doi-asserted-by":"publisher","unstructured":"Chien, S., Peng, I., Markidis, S.: Performance evaluation of advanced features in CUDA unified memory. In: 2019 IEEE\/ACM Workshop on Memory Centric High Performance Computing (MCHPC), pp. 50\u201357 (2019). https:\/\/doi.org\/10.1109\/MCHPC49590.2019.00014","DOI":"10.1109\/MCHPC49590.2019.00014"},{"key":"6_CR10","doi-asserted-by":"publisher","unstructured":"Cohen, M.B., Lee, Y.T., Song, Z.: Solving linear programs in the current matrix multiplication time. J. ACM 68(1) (2021). https:\/\/doi.org\/10.1145\/3424305","DOI":"10.1145\/3424305"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Diogo, M., Grelck, C.: Towards heterogeneous computing without heterogeneous programming. In: Loidl, H.W., Pe\u00f1a, R. (eds.) Trends in Functional Programming, pp. 279\u2013294. Springer, Berlin Heidelberg, Berlin, Heidelberg (2013)","DOI":"10.1007\/978-3-642-40447-4_18"},{"key":"6_CR12","doi-asserted-by":"publisher","unstructured":"Glaser, J., et al.: Strong scaling of general-purpose molecular dynamics simulations on GPUs. Comput. Phys. Commun. 192, 97\u2013107 (2015).https:\/\/doi.org\/10.1016\/j.cpc.2015.02.028, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0010465515000867","DOI":"10.1016\/j.cpc.2015.02.028"},{"key":"6_CR13","doi-asserted-by":"publisher","unstructured":"Gonz\u00e1lez, M., Morancho, E.: Multi-GPU systems and Unified Virtual Memory for scientific applications: the case of the NAS multi-zone parallel benchmarks. J. Parallel Distrib. Comput. 158, 138\u2013150 (2021).https:\/\/doi.org\/10.1016\/j.jpdc.2021.08.001, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0743731521001672","DOI":"10.1016\/j.jpdc.2021.08.001"},{"key":"6_CR14","doi-asserted-by":"publisher","first-page":"383","DOI":"10.1007\/s10766-006-0018-x","volume":"34","author":"C Grelck","year":"2006","unstructured":"Grelck, C., Scholz, S.B.: SAC\u2013A functional array language for efficient multi-threaded execution. Int. J. Parallel Prog. 34, 383\u2013427 (2006)","journal-title":"Int. J. Parallel Prog."},{"key":"6_CR15","doi-asserted-by":"publisher","unstructured":"Guo, J., Thiyagalingam, J., Scholz, S.B.: Breaking the GPU programming barrier with the auto-parallelising SAC compiler. In: Proceedings of the Sixth Workshop on Declarative Aspects of Multicore Programming, pp. 15\u201324. DAMP \u201911, Association for Computing Machinery, New York, NY, USA (2011). https:\/\/doi.org\/10.1145\/1926354.1926359","DOI":"10.1145\/1926354.1926359"},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"Henriksen, T., Serup, N.G., Elsman, M., Henglein, F., Oancea, C.E.: Futhark: purely functional GPU-programming with nested parallelism and in-place array updates. In: Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation, pp. 556\u2013571 (2017)","DOI":"10.1145\/3062341.3062354"},{"key":"6_CR17","unstructured":"Holst\u00a0Larsen, S.: Multi-GPU futhark using parallel streams. Master\u2019s thesis, University of Copenhagen (2019). https:\/\/futhark-lang.org\/student-projects\/steffen-msc-thesis.pdf"},{"key":"6_CR18","doi-asserted-by":"publisher","unstructured":"Hsu, A.W.: Accelerating information experts through compiler design. In: Proceedings of the 2nd ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming, pp. 37\u201342. ARRAY 2015, Association for Computing Machinery, New York, NY, USA (2015). https:\/\/doi.org\/10.1145\/2774959.2774968","DOI":"10.1145\/2774959.2774968"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Iverson, K.E.: A programming language. In: Proceedings of the May 1-3, 1962, spring joint computer conference, pp. 345\u2013351 (1962)","DOI":"10.1145\/1460833.1460872"},{"key":"6_CR20","doi-asserted-by":"publisher","unstructured":"Janssen, N., Scholz, S.B.: On mapping n-dimensional data-parallelism efficiently into GPU-thread-spaces. In: Proceedings of the 33rd Symposium on Implementation and Application of Functional Languages. p. 54\u201366. IFL \u201921, Association for Computing Machinery, New York, NY, USA (2022). https:\/\/doi.org\/10.1145\/3544885.3544894","DOI":"10.1145\/3544885.3544894"},{"key":"6_CR21","doi-asserted-by":"publisher","unstructured":"Jeon, W., Ko, G., Lee, J., Lee, H., Ha, D., Ro, W.W.: Chapter Six - Deep learning with GPUs. In: Kim, S., Deka, G.C. (eds.) Hardware Accelerator Systems for Artificial Intelligence and Machine Learning, Advances in Computers, vol.\u00a0122, pp. 167\u2013215. Elsevier (2021). https:\/\/doi.org\/10.1016\/bs.adcom.2020.11.003, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0065245820300905","DOI":"10.1016\/bs.adcom.2020.11.003"},{"key":"6_CR22","doi-asserted-by":"publisher","unstructured":"Jung, J., Park, D., Jo, G., Park, J., Lee, J.: SnuRHAC: A runtime for heterogeneous accelerator clusters with CUDA unified memory. In: Proceedings of the 30th International Symposium on High-Performance Parallel and Distributed Computing, pp. 107\u2013120. HPDC \u201921, Association for Computing Machinery, New York, NY, USA (2021). https:\/\/doi.org\/10.1145\/3431379.3460647","DOI":"10.1145\/3431379.3460647"},{"issue":"11","key":"6_CR23","doi-asserted-by":"publisher","first-page":"7625","DOI":"10.1007\/s11227-019-02966-8","volume":"75","author":"M Knap","year":"2019","unstructured":"Knap, M., Czarnul, P.: Performance evaluation of unified memory with prefetching and oversubscription for selected parallel CUDA applications on NVIDIA Pascal and Volta GPUs. J. Supercomput. 75(11), 7625\u20137645 (2019)","journal-title":"J. Supercomput."},{"key":"6_CR24","doi-asserted-by":"publisher","unstructured":"Landaverde, R., Zhang, T., Coskun, A.K., Herbordt, M.: An investigation of unified memory access performance in CUDA. In: 2014 IEEE High Performance Extreme Computing Conference (HPEC), pp.\u00a01\u20136 (2014). https:\/\/doi.org\/10.1109\/HPEC.2014.7040988","DOI":"10.1109\/HPEC.2014.7040988"},{"key":"6_CR25","doi-asserted-by":"publisher","unstructured":"Matsumura, K., Sato, M., Boku, T., Podobas, A., Matsuoka, S.: MACC: an OpenACC transpiler for automatic multi-GPU use. In: Yokota, R., Wu, W. (eds.) SCFA 2018. LNCS, vol. 10776, pp. 109\u2013127. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-69953-0_7","DOI":"10.1007\/978-3-319-69953-0_7"},{"key":"6_CR26","doi-asserted-by":"publisher","unstructured":"McDonell, T.L., Chakravarty, M.M., Keller, G., Lippmeier, B.: Optimising purely functional GPU programs. In: Proceedings of the 18th ACM SIGPLAN International Conference on Functional Programming, pp. 49\u201360. ICFP \u201913, Association for Computing Machinery, New York, NY, USA (2013). https:\/\/doi.org\/10.1145\/2500365.2500595","DOI":"10.1145\/2500365.2500595"},{"key":"6_CR27","doi-asserted-by":"publisher","unstructured":"Prades, J., Varghese, B., Rea\u00f1o, C., Silla, F.: Multi-tenant virtual GPUs for optimising performance of a financial risk application. J. Parallel Distrib. Comput. 108, 28\u201344 (2017).https:\/\/doi.org\/10.1016\/j.jpdc.2016.06.002, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0743731516300673, special Issue on Scalable Computing Systems for Big Data Applications","DOI":"10.1016\/j.jpdc.2016.06.002"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Svensson, B.J., Vollmer, M., Holk, E., McDonell, T.L., Newton, R.R.: Converting data-parallelism to task-parallelism by rewrites: purely functional programs across multiple GPUs. In: Proceedings of the 4th ACM SIGPLAN Workshop on Functional High-Performance Computing, pp. 12\u201322 (2015)","DOI":"10.1145\/2808091.2808093"},{"key":"6_CR29","doi-asserted-by":"publisher","unstructured":"Vedaldi, A., Lenc, K.: MatConvNet: Convolutional Neural Networks for MATLAB. In: Proceedings of the 23rd ACM International Conference on Multimedia, pp. 689\u2013692. MM \u201915, Association for Computing Machinery, New York, NY, USA (2015). https:\/\/doi.org\/10.1145\/2733373.2807412","DOI":"10.1145\/2733373.2807412"},{"key":"6_CR30","doi-asserted-by":"publisher","unstructured":"Verloop, M., Koopman, T., Scholz, S.B.: Modulo in high-performance code: strength reduction for modulo-based array indexing in loops. In: Proceedings of the 35th Symposium on Implementation and Application of Functional Languages. IFL \u201923, Association for Computing Machinery, New York, NY, USA (2024). https:\/\/doi.org\/10.1145\/3652561.3652573","DOI":"10.1145\/3652561.3652573"},{"key":"6_CR31","doi-asserted-by":"publisher","unstructured":"Vie\u00dfmann, H.N., Scholz, S.B.: Effective host-GPU memory management through code generation. In: Proceedings of the 32nd Symposium on Implementation and Application of Functional Languages, pp. 138\u2013149. IFL \u201920, Association for Computing Machinery, New York, NY, USA (2021). https:\/\/doi.org\/10.1145\/3462172.3462199","DOI":"10.1145\/3462172.3462199"},{"key":"6_CR32","doi-asserted-by":"publisher","unstructured":"\u0160inkarovs, A., Vie\u00dfmann, H.N., Scholz, S.B.: Array languages make neural networks fast. In: Proceedings of the 7th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming, pp. 39\u201350. ARRAY 2021, Association for Computing Machinery, New York, NY, USA (2021). https:\/\/doi.org\/10.1145\/3460944.3464312","DOI":"10.1145\/3460944.3464312"}],"container-title":["Lecture Notes in Computer Science","Trends in Functional Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-99751-8_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T20:22:05Z","timestamp":1764361325000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-99751-8_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,1]]},"ISBN":["9783031997501","9783031997518"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-99751-8_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,10,1]]},"assertion":[{"value":"1 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TFP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Trends in Functional Programming","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Oxford","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tfp2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/trendsfp.github.io\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}