{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T17:43:42Z","timestamp":1743011022390,"version":"3.40.3"},"publisher-location":"Cham","reference-count":67,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030162719"},{"type":"electronic","value":"9783030162726"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-16272-6_2","type":"book-chapter","created":{"date-parts":[[2019,3,25]],"date-time":"2019-03-25T12:08:22Z","timestamp":1553515702000},"page":"36-68","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Parallelization of Hierarchical Matrix Algorithms for Electromagnetic Scattering Problems"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1154-9587","authenticated-orcid":false,"given":"Elisabeth","family":"Larsson","sequence":"first","affiliation":[]},{"given":"Afshin","family":"Zafari","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4652-8041","authenticated-orcid":false,"given":"Marco","family":"Righero","sequence":"additional","affiliation":[]},{"given":"M. Alessandro","family":"Francavilla","sequence":"additional","affiliation":[]},{"given":"Giorgio","family":"Giordanengo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0791-9269","authenticated-orcid":false,"given":"Francesca","family":"Vipiana","sequence":"additional","affiliation":[]},{"given":"Giuseppe","family":"Vecchi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5241-0026","authenticated-orcid":false,"given":"Christoph","family":"Kessler","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3310-7651","authenticated-orcid":false,"given":"Corinne","family":"Ancourt","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3003-1388","authenticated-orcid":false,"given":"Clemens","family":"Grelck","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,3,26]]},"reference":[{"issue":"10","key":"2_CR1","doi-asserted-by":"publisher","first-page":"2794","DOI":"10.1109\/TPDS.2017.2697857","volume":"28","author":"E Agullo","year":"2017","unstructured":"Agullo, E., Aumage, O., Bramas, B., Coulaud, O., Pitoiset, S.: Bridging the gap between OpenMP and task-based runtime systems for the fast multipole method. IEEE Trans. Parallel Distrib. Syst. 28(10), 2794\u20132807 (2017). https:\/\/doi.org\/10.1109\/TPDS.2017.2697857","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"1","key":"2_CR2","doi-asserted-by":"publisher","first-page":"C66","DOI":"10.1137\/130915662","volume":"36","author":"E Agullo","year":"2014","unstructured":"Agullo, E., Bramas, B., Coulaud, O., Darve, E., Messner, M., Takahashi, T.: Task-based FMM for multicore architectures. SIAM J. Sci. Comput. 36(1), C66\u2013C93 (2014). https:\/\/doi.org\/10.1137\/130915662","journal-title":"SIAM J. Sci. Comput."},{"key":"2_CR3","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898719604","volume-title":"LAPACK Users\u2019 Guide","author":"E Anderson","year":"1999","unstructured":"Anderson, E., et al.: LAPACK Users\u2019 Guide, 3rd edn. Society for Industrial and Applied Mathematics, Philadelphia (1999)","edition":"3"},{"key":"2_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1007\/978-3-319-65578-9_7","volume-title":"Scaling OpenMP for Exascale Performance and Portability","author":"P Atkinson","year":"2017","unstructured":"Atkinson, P., McIntosh-Smith, S.: On the performance of parallel tasking runtimes for an irregular fast multipole method application. In: de Supinski, B.R., Olivier, S.L., Terboven, C., Chapman, B.M., M\u00fcller, M.S. (eds.) IWOMP 2017. LNCS, vol. 10468, pp. 92\u2013106. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-65578-9_7"},{"issue":"2","key":"2_CR5","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1002\/cpe.1631","volume":"23","author":"C Augonnet","year":"2011","unstructured":"Augonnet, C., Thibault, S., Namyst, R., Wacrenier, P.: StarPU: a unified platform for task scheduling on heterogeneous multicore architectures. Concurr. Comput.: Pract. Exper. 23(2), 187\u2013198 (2011). https:\/\/doi.org\/10.1002\/cpe.1631","journal-title":"Concurr. Comput.: Pract. Exper."},{"issue":"4","key":"2_CR6","doi-asserted-by":"publisher","first-page":"C335","DOI":"10.1137\/130945569","volume":"36","author":"AR Benson","year":"2014","unstructured":"Benson, A.R., Poulson, J., Tran, K., Engquist, B., Ying, L.: A parallel directional fast multipole method. SIAM J. Sci. Comput. 36(4), C335\u2013C352 (2014). https:\/\/doi.org\/10.1137\/130945569","journal-title":"SIAM J. Sci. Comput."},{"key":"2_CR7","unstructured":"Bordage, C.: Parallelization on heterogeneous multicore and multi-GPU systems of the fast multipole method for the Helmholtz equation using a runtime system. In: Omatu, S., Nguyen, T. (eds.) Proceedings of the Sixth International Conference on Advanced Engineering Computing and Applications in Sciences, pp. 90\u201395. International Academy, Research, and Industry Association (IARIA), Curran Associates Inc., Red Hook (2012)"},{"issue":"6","key":"2_CR8","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1109\/MCSE.2013.98","volume":"15","author":"G Bosilca","year":"2013","unstructured":"Bosilca, G., Bouteiller, A., Danalis, A., Faverge, M., H\u00e9rault, T., Dongarra, J.J.: PaRSEC: exploiting heterogeneity to enhance scalability. Comput. Sci. Eng. 15(6), 36\u201345 (2013)","journal-title":"Comput. Sci. Eng."},{"key":"2_CR9","unstructured":"Budimli\u0107, Z., Chandramowlishwaran, A., Knobe, K., Lowney, G., Sarkar, V., Treggiari, L.: Multicore implementations of the Concurrent Collections programming model. In: 14th Workshop on Compilers for Parallel Computing, Z\u00fcrich, Switzerland (2009)"},{"issue":"1","key":"2_CR10","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1016\/j.parco.2008.10.002","volume":"35","author":"A Buttari","year":"2009","unstructured":"Buttari, A., Langou, J., Kurzak, J., Dongarra, J.: A class of parallel tiled linear algebra algorithms for multicore architectures. Parallel Comput. 35(1), 38\u201353 (2009)","journal-title":"Parallel Comput."},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Chandramowlishwaran, A., Knobe, K., Vuduc, R.: Performance evaluation of Concurrent Collections on high-performance multicore computing systems. In: 24th IEEE International Parallel and Distributed Processing Symposium (IPDPS 2010), Atlanta, USA, pp. 1\u201312. IEEE, April 2010","DOI":"10.1109\/IPDPS.2010.5470404"},{"issue":"4","key":"2_CR12","doi-asserted-by":"publisher","first-page":"403","DOI":"10.1002\/nme.2972","volume":"85","author":"FA Cruz","year":"2011","unstructured":"Cruz, F.A., Knepley, M.G., Barba, L.A.: PetFMM\u2013a dynamically load-balancing parallel fast multipole library. Int. J. Numer. Methods Eng. 85(4), 403\u2013428 (2011). https:\/\/doi.org\/10.1002\/nme.2972","journal-title":"Int. J. Numer. Methods Eng."},{"issue":"2","key":"2_CR13","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1016\/j.crme.2010.12.005","volume":"339","author":"E Darve","year":"2011","unstructured":"Darve, E., Cecka, C., Takahashi, T.: The fast multipole method on parallel clusters, multicore processors, and graphics processing units. Comptes Rendus M\u00e9canique 339(2), 185\u2013193 (2011). https:\/\/doi.org\/10.1016\/j.crme.2010.12.005","journal-title":"Comptes Rendus M\u00e9canique"},{"key":"2_CR14","doi-asserted-by":"publisher","unstructured":"Dastgeer, U., Kessler, C., Thibault, S.: Flexible runtime support for efficient skeleton programming on hybrid systems. In: Proceedings of the ParCo-2011 International Conference on Parallel Computing, Ghent, Belgium, September 2011. Advances in Parallel Computing, vol. 22, pp. 159\u2013166. IOS press (2012). https:\/\/doi.org\/10.3233\/978-1-61499-041-3-159","DOI":"10.3233\/978-1-61499-041-3-159"},{"issue":"02","key":"2_CR15","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1142\/S0129626411000151","volume":"21","author":"A Duran","year":"2011","unstructured":"Duran, A., et al.: OmpSs: a proposal for programming heterogeneous multi-core architectures. Parallel Proces. Lett. 21(02), 173\u2013193 (2011)","journal-title":"Parallel Proces. Lett."},{"key":"2_CR16","unstructured":"Efield\u00ae. http:\/\/www.efieldsolutions.com\/"},{"key":"2_CR17","doi-asserted-by":"publisher","unstructured":"Enmyren, J., Kessler, C.: SkePU: a multi-backend skeleton programming library for multi-GPU systems. In: Proceedings of the 4th Internatioanl Workshop on High-Level Parallel Programming and Applications (HLPP-2010). ACM, September 2010. https:\/\/doi.org\/10.1145\/1863482.1863487","DOI":"10.1145\/1863482.1863487"},{"key":"2_CR18","doi-asserted-by":"publisher","unstructured":"Ernstsson, A., Li, L., Kessler, C.: SkePU\u00a02: flexible and type-safe skeleton programming for heterogeneous parallel systems. Int. J. Parallel Program. 46(1) (2018). https:\/\/doi.org\/10.1007\/s10766-017-0490-5","DOI":"10.1007\/s10766-017-0490-5"},{"key":"2_CR19","doi-asserted-by":"publisher","unstructured":"Filipovic, J., Benkner, S.: OpenCL kernel fusion for GPU, Xeon Phi and CPU. In: Proceedings of the 27th International Symposium on Computer Architecture and High-Performance Computing (SBAC-PAD 2015), pp. 98\u2013105. IEEE (2015). https:\/\/doi.org\/10.1109\/SAC-PAD.2015.29","DOI":"10.1109\/SAC-PAD.2015.29"},{"key":"2_CR20","doi-asserted-by":"publisher","first-page":"3934","DOI":"10.1007\/s11227-015-1483-z","volume":"71","author":"J Filipovic","year":"2015","unstructured":"Filipovic, J., Madzin, M., Fousek, J., Matyska, L.: Optimizing CUDA code by kernel fusion: application on BLAS. J. Supercomput. 71, 3934\u20133957 (2015). https:\/\/doi.org\/10.1007\/s11227-015-1483-z","journal-title":"J. Supercomput."},{"key":"2_CR21","doi-asserted-by":"publisher","unstructured":"Fukuda, K., Matsuda, M., Maruyama, N., Yokota, R., Taura, K., Matsuoka, S.: Tapas: an implicitly parallel programming framework for hierarchical $$n$$n-body algorithms. In: 2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS), pp. 1100\u20131109, December 2016. https:\/\/doi.org\/10.1109\/ICPADS.2016.0145","DOI":"10.1109\/ICPADS.2016.0145"},{"issue":"2","key":"2_CR22","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1145\/129630.129635","volume":"35","author":"D Gelernter","year":"1992","unstructured":"Gelernter, D., Carriero, N.: Coordination languages and their significance. Commun. ACM 35(2), 97\u2013107 (1992)","journal-title":"Commun. ACM"},{"issue":"6","key":"2_CR23","doi-asserted-by":"publisher","first-page":"988","DOI":"10.1007\/s10766-013-0271-8","volume":"42","author":"B Gijsbers","year":"2014","unstructured":"Gijsbers, B., Grelck, C.: An efficient scalable runtime system for macro data flow processing using S-Net. Int. J. Parallel Program. 42(6), 988\u20131011 (2014). https:\/\/doi.org\/10.1007\/s10766-013-0271-8","journal-title":"Int. J. Parallel Program."},{"key":"2_CR24","unstructured":"Gouin, F.: Methodology for image processing algorithms mapping on massively parallel architectures. Technical report, MINES ParisTech (2018)"},{"key":"2_CR25","unstructured":"Gouin, F., Ancourt, C., Guettier, C.: An up to date mapping methodology for GPUs. In: 20th Workshop on Compilers for Parallel Computing (CPC 2018), Dublin, Ireland, April 2018. https:\/\/hal-mines-paristech.archives-ouvertes.fr\/hal-01759238"},{"key":"2_CR26","doi-asserted-by":"publisher","unstructured":"Grelck, C., Julku, J., Penczek, F.: Distributed S-Net: cluster and grid computing without the hassle. In: 12th IEEE\/ACM International Conference on Cluster, Cloud and Grid Computing (CCGrid 2012), Ottawa, Canada. IEEE Computer Society (2012). https:\/\/doi.org\/10.1109\/CCGrid.2012.140","DOI":"10.1109\/CCGrid.2012.140"},{"issue":"1","key":"2_CR27","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s10766-009-0121-x","volume":"38","author":"C Grelck","year":"2010","unstructured":"Grelck, C., Scholz, S., Shafarenko, A.: Asynchronous stream processing with S-Net. Int. J. Parallel Program. 38(1), 38\u201367 (2010). https:\/\/doi.org\/10.1007\/s10766-009-0121-x","journal-title":"Int. J. Parallel Program."},{"key":"2_CR28","doi-asserted-by":"publisher","unstructured":"Grelck, C., Scholz, S.B., Shafarenko, A.: Coordinating data parallel SAC programs with S-Net. In: Proceedings of the 21st IEEE International Parallel and Distributed Processing Symposium (IPDPS 2007), Long Beach, California, USA. IEEE Computer Society Press, Los Alamitos (2007). https:\/\/doi.org\/10.1109\/IPDPS.2007.370408","DOI":"10.1109\/IPDPS.2007.370408"},{"key":"2_CR29","doi-asserted-by":"publisher","unstructured":"Gupta, K., Stuart, J.A., Owens, J.D.: A study of persistent threads style GPU programming for GPGPU workloads. In: Innovative Parallel Computing - Foundations and Applications of GPU, Manycore, and Heterogeneous Systems (INPAR 2012), pp. 1\u201314. IEEE, May 2012. https:\/\/doi.org\/10.1109\/InPar.2012.6339596","DOI":"10.1109\/InPar.2012.6339596"},{"issue":"2","key":"2_CR30","doi-asserted-by":"publisher","first-page":"332","DOI":"10.1109\/JPROC.2012.2222331","volume":"101","author":"L G\u00fcrel","year":"2013","unstructured":"G\u00fcrel, L., Erg\u00fcl, O.: Hierarchical parallelization of the multilevel fast multipole algorithm (MLFMA). Proc. IEEE 101(2), 332\u2013341 (2013). https:\/\/doi.org\/10.1109\/JPROC.2012.2222331","journal-title":"Proc. IEEE"},{"key":"2_CR31","doi-asserted-by":"publisher","unstructured":"Holm, M., Engblom, S., Goude, A., Holmgren, S.: Dynamic autotuning of adaptive fast multipole methods on hybrid multicore CPU and GPU systems. SIAM J. Sci. Comput. 36(4) (2014). https:\/\/doi.org\/10.1137\/130943595","DOI":"10.1137\/130943595"},{"key":"2_CR32","doi-asserted-by":"publisher","unstructured":"Kessler, C., et al.: Programmability and performance portability aspects of heterogeneous multi-\/manycore systems. In: Proceedings of the DATE-2012 Conference on Design, Automation and Test in Europe, pp. 1403\u20131408. IEEE, March 2012. https:\/\/doi.org\/10.1109\/DATE.2012.6176582","DOI":"10.1109\/DATE.2012.6176582"},{"key":"2_CR33","unstructured":"Knobe, K.: Ease of use with Concurrent Collections (CnC). In: USENIX Workshop on Hot Topics in Parallelism (HotPar 2009), Berkeley USA (2009)"},{"issue":"7","key":"2_CR34","doi-asserted-by":"publisher","first-page":"870","DOI":"10.1016\/j.jpdc.2005.02.001","volume":"65","author":"J Kurzak","year":"2005","unstructured":"Kurzak, J., Pettitt, B.M.: Massively parallel implementation of a fast multipole method for distributed memory machines. J. Parallel Distrib. Comput. 65(7), 870\u2013881 (2005). https:\/\/doi.org\/10.1016\/j.jpdc.2005.02.001","journal-title":"J. Parallel Distrib. Comput."},{"issue":"5","key":"2_CR35","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1145\/2160718.2160740","volume":"55","author":"I Lashuk","year":"2012","unstructured":"Lashuk, I., et al.: A massively parallel adaptive fast multipole method on heterogeneous architectures. Commun. ACM 55(5), 101\u2013109 (2012). https:\/\/doi.org\/10.1145\/2160718.2160740","journal-title":"Commun. ACM"},{"key":"2_CR36","doi-asserted-by":"publisher","unstructured":"Li, L., Kessler, C.: Lazy allocation and transfer fusion optimization for GPU-based heterogeneous systems. In: Proceedings of the Euromicro PDP-2018 International Conference on Parallel, Distributed, and Network-Based Processing, pp. 311\u2013315. IEEE, March 2018. https:\/\/doi.org\/10.1109\/PDP2018.2018.00054","DOI":"10.1109\/PDP2018.2018.00054"},{"issue":"7","key":"2_CR37","doi-asserted-by":"publisher","first-page":"3664","DOI":"10.1109\/TAP.2014.2321139","volume":"62","author":"M Li","year":"2014","unstructured":"Li, M., Francavilla, M., Vipiana, F., Vecchi, G., Chen, R.: Nested equivalent source approximation for the modeling of multiscale structures. IEEE Trans. Antennas Propag. 62(7), 3664\u20133678 (2014)","journal-title":"IEEE Trans. Antennas Propag."},{"issue":"5","key":"2_CR38","doi-asserted-by":"publisher","first-page":"1103","DOI":"10.1109\/TEMC.2014.2306691","volume":"56","author":"M Li","year":"2014","unstructured":"Li, M., Francavilla, M., Vipiana, F., Vecchi, G., Fan, Z., Chen, R.: A doubly hierarchical MoM for high-fidelity modeling of multiscale structures. IEEE Trans. Electromagn. Compat. 56(5), 1103\u20131111 (2014)","journal-title":"IEEE Trans. Electromagn. Compat."},{"issue":"5","key":"2_CR39","doi-asserted-by":"publisher","first-page":"2122","DOI":"10.1109\/TAP.2015.2402297","volume":"63","author":"M Li","year":"2015","unstructured":"Li, M., Francavilla, M.A., Chen, R., Vecchi, G.: Wideband fast kernel-independent modeling of large multiscale structures via nested equivalent source approximation. IEEE Trans. Antennas Propag. 63(5), 2122\u20132134 (2015). https:\/\/doi.org\/10.1109\/TAP.2015.2402297","journal-title":"IEEE Trans. Antennas Propag."},{"issue":"11","key":"2_CR40","doi-asserted-by":"publisher","first-page":"1935","DOI":"10.1002\/cpe.3132","volume":"26","author":"H Ltaief","year":"2014","unstructured":"Ltaief, H., Yokota, R.: Data-driven execution of fast multipole methods. Concurr. Comput.: Pract. Exp. 26(11), 1935\u20131946 (2014). https:\/\/doi.org\/10.1002\/cpe.3132","journal-title":"Concurr. Comput.: Pract. Exp."},{"key":"2_CR41","doi-asserted-by":"publisher","unstructured":"Maghazeh, A., Bordoloi, U.D., Dastgeer, U., Andrei, A., Eles, P., Peng, Z.: Latency-aware packet processing on CPU-GPU heterogeneous systems. In: Proceedings of the Design Automation Conference (DAC), pp. 41:1\u201341:6. ACM (2017). https:\/\/doi.org\/10.1145\/3061639.3062269","DOI":"10.1145\/3061639.3062269"},{"key":"2_CR42","first-page":"71","volume":"33","author":"JR Mautz","year":"1979","unstructured":"Mautz, J.R., Harrington, R.F.: Electromagnetic scattering from homogeneous material body of revolution. Arch. Electron. \u00dcbertragungstech 33, 71\u201380 (1979)","journal-title":"Arch. Electron. \u00dcbertragungstech"},{"key":"2_CR43","unstructured":"Nilsson, M.: Fast numerical techniques for electromagnetic problems in frequency domain. Ph.D. thesis, Division of Scientific Computing, Department of Information Technology, Uppsala University (2003)"},{"key":"2_CR44","doi-asserted-by":"publisher","unstructured":"Penczek, F., Cheng, W., Grelck, C., Kirner, R., Scheuermann, B., Shafarenko, A.: A data-flow based coordination approach to concurrent software engineering. In: 2nd Workshop on Data-Flow Execution Models for Extreme Scale Computing (DFM 2012), Minneapolis, USA. IEEE (2012). https:\/\/doi.org\/10.1109\/DFM.2012.14","DOI":"10.1109\/DFM.2012.14"},{"issue":"1","key":"2_CR45","doi-asserted-by":"publisher","first-page":"2079","DOI":"10.1016\/j.procs.2010.04.233","volume":"1","author":"F Penczek","year":"2010","unstructured":"Penczek, F., et al.: Parallel signal processing with S-Net. Procedia Comput. Sci. 1(1), 2079\u20132088 (2010). https:\/\/doi.org\/10.1016\/j.procs.2010.04.233 . http:\/\/www.sciencedirect.com\/science\/article\/B9865-506HM1Y-88\/2\/87fcf1cee7899f0eeaadc90bd0d56cd3 , iCCS 2010","journal-title":"Procedia Comput. Sci."},{"key":"2_CR46","doi-asserted-by":"publisher","unstructured":"P\u00e9rez, J.M., Badia, R.M., Labarta, J.: A dependency-aware task-based programming environment for multi-core architectures. In: Proceedings of the 2008 IEEE International Conference on Cluster Computing, Tsukuba, Japan, 29 September\u20131 October 2008, pp. 142\u2013151 (2008). https:\/\/doi.org\/10.1109\/CLUSTR.2008.4663765","DOI":"10.1109\/CLUSTR.2008.4663765"},{"key":"2_CR47","unstructured":"Puma-EM. https:\/\/sourceforge.net\/projects\/puma-em\/"},{"key":"2_CR48","doi-asserted-by":"publisher","unstructured":"Qiao, B., Reiche, O., Hannig, F., Teich, J.: Automatic kernel fusion for image processing DSLs. In: Proceedings of the 21th International Workshop on Software and Compilers for Embedded Systems (SCOPES 2018). ACM, May 2018. https:\/\/doi.org\/10.1145\/3207719.3207723","DOI":"10.1145\/3207719.3207723"},{"issue":"3","key":"2_CR49","doi-asserted-by":"publisher","first-page":"409","DOI":"10.1109\/TAP.1982.1142818","volume":"30","author":"S Rao","year":"1982","unstructured":"Rao, S., Wilton, D., Glisson, A.: Electromagnetic scattering by surfaces of arbitrary shape. IEEE Trans. Antennas Propag. 30(3), 409\u2013418 (1982)","journal-title":"IEEE Trans. Antennas Propag."},{"issue":"5","key":"2_CR50","doi-asserted-by":"publisher","first-page":"1476","DOI":"10.1109\/TMAG.2005.844564","volume":"41","author":"SM Seo","year":"2005","unstructured":"Seo, S.M., Lee, J.F.: A fast IE-FFT algorithm for solving PEC scattering problems. IEEE Trans. Magn. 41(5), 1476\u20131479 (2005)","journal-title":"IEEE Trans. Magn."},{"issue":"10","key":"2_CR51","doi-asserted-by":"publisher","first-page":"1488","DOI":"10.1109\/8.633855","volume":"45","author":"J Song","year":"1997","unstructured":"Song, J., Lu, C.C., Chew, W.C.: Multilevel fast multipole algorithm for electromagnetic scattering by large complex objects. IEEE Trans. Antennas Propag. 45(10), 1488\u20131493 (1997)","journal-title":"IEEE Trans. Antennas Propag."},{"key":"2_CR52","unstructured":"Thibault, S.: On Runtime Systems for Task-based Programming on Heterogeneous Platforms. Habilitation \u00e0 diriger des recherches, L\u2019Universit\u00e9 Bordeaux (2018)"},{"key":"2_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1007\/978-3-642-40047-6_19","volume-title":"Euro-Par 2013 Parallel Processing","author":"P Thoman","year":"2013","unstructured":"Thoman, P., Jordan, H., Fahringer, T.: Adaptive granularity control in task parallel programs using multiversioning. In: Wolf, F., Mohr, B., an Mey, D. (eds.) Euro-Par 2013. LNCS, vol. 8097, pp. 164\u2013177. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-40047-6_19"},{"key":"2_CR54","doi-asserted-by":"publisher","unstructured":"Tillenius, M.: SuperGlue: a shared memory framework using data versioning for dependency-aware task-based parallelization. SIAM J. Sci. Comput. 37(6) (2015). https:\/\/doi.org\/10.1137\/140989716","DOI":"10.1137\/140989716"},{"issue":"1","key":"2_CR55","doi-asserted-by":"publisher","first-page":"5:1","DOI":"10.1145\/2638554","volume":"14","author":"M Tillenius","year":"2015","unstructured":"Tillenius, M., Larsson, E., Badia, R.M., Martorell, X.: Resource-aware task scheduling. ACM Trans. Embedded Comput. Syst. 14(1), 5:1\u20135:25 (2015). https:\/\/doi.org\/10.1145\/2638554","journal-title":"ACM Trans. Embedded Comput. Syst."},{"issue":"8","key":"2_CR56","doi-asserted-by":"publisher","first-page":"2719","DOI":"10.1109\/TAP.2005.851859","volume":"53","author":"S Velamparambil","year":"2005","unstructured":"Velamparambil, S., Chew, W.C.: Analysis and performance of a distributed memory multilevel fast multipole algorithm. IEEE Trans. Antennas Propag. 53(8), 2719\u20132727 (2005). https:\/\/doi.org\/10.1109\/TAP.2005.851859","journal-title":"IEEE Trans. Antennas Propag."},{"issue":"7","key":"2_CR57","doi-asserted-by":"publisher","first-page":"2362","DOI":"10.1109\/TAP.2010.2048855","volume":"58","author":"F Vipiana","year":"2010","unstructured":"Vipiana, F., Francavilla, M., Vecchi, G.: EFIE modeling of high-definition multiscale structures. IEEE Trans. Antennas Propag. 58(7), 2362\u20132374 (2010)","journal-title":"IEEE Trans. Antennas Propag."},{"key":"2_CR58","doi-asserted-by":"publisher","unstructured":"Wahib, M., Maruyama, N.: Scalable kernel fusion for memory-bound GPU applications. In: Proceedings of the International Conference for High-Performance Computing, Networking, Storage and Analysis (SC 2014), pp. 191\u2013202. IEEE (2014). https:\/\/doi.org\/10.1109\/SC.2014.21","DOI":"10.1109\/SC.2014.21"},{"key":"2_CR59","doi-asserted-by":"publisher","unstructured":"Wang, G., Lin, Y., Yi, W.: Kernel fusion: an effective method for better power efficiency on multithreaded GPU. In: Proceedings of the IEEE\/ACM International Conference on Green Computing and Communications and International Conference on Cyber, Physical and Social Computing, pp. 344\u2013350 (2010). https:\/\/doi.org\/10.1109\/GreenCom-CPSCom.2010.102","DOI":"10.1109\/GreenCom-CPSCom.2010.102"},{"key":"2_CR60","doi-asserted-by":"publisher","unstructured":"Wen, Y., O\u2019Boyle, M.F., Fensch, C.: MaxPair: enhance OpenCL concurrent kernel execution by weighted maximum matching. In: Proceedings of the GPGPU-11. ACM (2018). https:\/\/doi.org\/10.1145\/3180270.3180272","DOI":"10.1145\/3180270.3180272"},{"key":"2_CR61","unstructured":"YarKhan, A., Kurzak, J., Dongarra, J.: Quark users\u2019 guide: queueing and runtime for kernels. Technical report. ICL-UT-11-02 (2011)"},{"key":"2_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1007\/978-3-319-78024-5_16","volume-title":"Parallel Processing and Applied Mathematics","author":"A Zafari","year":"2018","unstructured":"Zafari, A.: TaskUniVerse: a task-based unified interface for versatile parallel execution. In: Wyrzykowski, R., Dongarra, J., Deelman, E., Karczewski, K. (eds.) PPAM 2017. LNCS, vol. 10777, pp. 169\u2013184. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-78024-5_16"},{"key":"2_CR63","unstructured":"Zafari, A., et al.: Task parallel implementation of a solver for electromagnetic scattering problems. CoRR abs\/1801.03589 (2018). http:\/\/arxiv.org\/abs\/1801.03589"},{"key":"2_CR64","doi-asserted-by":"crossref","unstructured":"Zafari, A., Larsson, E., Tillenius, M.: DuctTeip: an efficient programming model for distributed task-based parallel computing (2019, submitted)","DOI":"10.1016\/j.parco.2019.102582"},{"key":"2_CR65","doi-asserted-by":"publisher","unstructured":"Zaichenkov, P., Gijsbers, B., Grelck, C., Tveretina, O., Shafarenko, A.: The cost and benefits of coordination programming: two case studies in Concurrent Collections (CnC) and S-Net. Parallel Process. Lett. 26(3) (2016). https:\/\/doi.org\/10.1142\/S0129626416500110","DOI":"10.1142\/S0129626416500110"},{"key":"2_CR66","doi-asserted-by":"publisher","unstructured":"Zhang, B.: Asynchronous task scheduling of the fast multipole method using various runtime systems. In: 2014 Fourth Workshop on Data-Flow Execution Models for Extreme Scale Computing, pp. 9\u201316 (2014). https:\/\/doi.org\/10.1109\/DFM.2014.14","DOI":"10.1109\/DFM.2014.14"},{"issue":"4","key":"2_CR67","doi-asserted-by":"publisher","first-page":"763","DOI":"10.1109\/TEMC.2005.857898","volume":"47","author":"K Zhao","year":"2005","unstructured":"Zhao, K., Vouvakis, M.N., Lee, J.F.: The adaptive cross approximation algorithm for accelerated method of moments computations of EMC problems. IEEE Trans. Electromagn. Compat. 47(4), 763\u2013773 (2005)","journal-title":"IEEE Trans. Electromagn. Compat."}],"container-title":["Lecture Notes in Computer Science","High-Performance Modelling and Simulation for Big Data Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-16272-6_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,11,22]],"date-time":"2019-11-22T00:54:16Z","timestamp":1574384056000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-16272-6_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030162719","9783030162726"],"references-count":67,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-16272-6_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"26 March 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}