{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T12:04:39Z","timestamp":1767182679487,"version":"3.40.3"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,3,18]],"date-time":"2025-03-18T00:00:00Z","timestamp":1742256000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,18]],"date-time":"2025-03-18T00:00:00Z","timestamp":1742256000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"crossref","award":["DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969"],"award-info":[{"award-number":["DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969","DE-NA0003969"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s10766-025-00788-1","type":"journal-article","created":{"date-parts":[[2025,3,18]],"date-time":"2025-03-18T19:26:17Z","timestamp":1742325977000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Performance Characterization of Python Runtimes for Multi-device Task Parallel Programming"],"prefix":"10.1007","volume":"53","author":[{"given":"William","family":"Ruys","sequence":"first","affiliation":[]},{"given":"Hochan","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Bozhi","family":"You","sequence":"additional","affiliation":[]},{"given":"Shreya","family":"Talati","sequence":"additional","affiliation":[]},{"given":"Jaeyoung","family":"Park","sequence":"additional","affiliation":[]},{"given":"James","family":"Almgren-Bell","sequence":"additional","affiliation":[]},{"given":"Yineng","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Milinda","family":"Fernando","sequence":"additional","affiliation":[]},{"given":"Mattan","family":"Erez","sequence":"additional","affiliation":[]},{"given":"Milos","family":"Gligoric","sequence":"additional","affiliation":[]},{"given":"Martin","family":"Burtscher","sequence":"additional","affiliation":[]},{"given":"Christopher J.","family":"Rossbach","sequence":"additional","affiliation":[]},{"given":"Keshav","family":"Pingali","sequence":"additional","affiliation":[]},{"given":"George","family":"Biros","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,18]]},"reference":[{"key":"788_CR1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2011.37","author":"S van der Walt","year":"2011","unstructured":"van der Walt, S., Colbert, S.C., Varoquaux, G.: The NumPy array: a structure for efficient numerical computation. Comput. Sci. Eng. (2011). https:\/\/doi.org\/10.1109\/MCSE.2011.37","journal-title":"Comput. Sci. Eng."},{"issue":"3","key":"788_CR2","doi-asserted-by":"crossref","first-page":"261","DOI":"10.1038\/s41592-019-0686-2","volume":"17","author":"P Virtanen","year":"2020","unstructured":"Virtanen, P., Gommers, R., Oliphant, T.E., Haberland, M., Reddy, T., Cournapeau, D., Burovski, E., Peterson, P., Weckesser, W., Bright, J., et al.: SciPy 1.0: fundamental algorithms for scientific computing in Python. Nat. Methods 17(3), 261\u2013272 (2020)","journal-title":"Nat. Methods"},{"key":"788_CR3","unstructured":"Preferred Networks, inc.: CuPy: A NumPy-compatible matrix library accelerated by CUDA (2020). https:\/\/cupy.chainer.org\/"},{"key":"788_CR4","unstructured":"Anaconda: Numba: A High-performance Python Compiler (2018). https:\/\/numba.pydata.org\/"},{"key":"788_CR5","doi-asserted-by":"crossref","unstructured":"Al\u00a0Awar, N., Zhu, S., Biros, G., Gligoric, M.: A performance portability framework for python. In: Proceedings of the ACM International Conference on Supercomputing (2021)","DOI":"10.1145\/3447818.3460376"},{"key":"788_CR6","unstructured":"Bradbury, J., Frostig, R., Hawkins, P., Johnson, M.J., Leary, C., Maclaurin, D., Necula, G., Paszke, A., VanderPlas, J., Wanderman-Milne, S., et al.: Jax: Autograd and xla. Astrophysics Source Code Library, 2111 (2021)"},{"key":"788_CR7","doi-asserted-by":"crossref","unstructured":"Augonnet, C., Thibault, S., Namyst, R., Wacrenier, P.-A.: StarPU: a unified platform for task scheduling on heterogeneous multicore architectures. Concurrency and Computation: Practice and Experience, 187\u2013198 (2011)","DOI":"10.1002\/cpe.1631"},{"issue":"6","key":"788_CR8","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1109\/MCSE.2013.98","volume":"15","author":"G Bosilca","year":"2013","unstructured":"Bosilca, G., Bouteiller, A., Danalis, A., Faverge, M., Herault, T., Dongarra, J.J.: PaRSEC: exploiting Heterogeneity to Enhance Scalability. Comput. Sci. Eng. 15(6), 36\u201345 (2013). https:\/\/doi.org\/10.1109\/MCSE.2013.98. (Conference Name: Computing in Science Engineering)","journal-title":"Comput. Sci. Eng."},{"key":"788_CR9","doi-asserted-by":"publisher","unstructured":"Bauer, M., Treichler, S., Slaughter, E., Aiken, A.: Legion: Expressing locality and independence with logical regions. In: SC \u201912: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 1\u201311 (2012). https:\/\/doi.org\/10.1109\/SC.2012.71","DOI":"10.1109\/SC.2012.71"},{"key":"788_CR10","unstructured":"Foord, M., Muirhead, C.: IronPython in Action. Manning Publications Co. (2009)"},{"key":"788_CR11","unstructured":"Gross, S.: PEP 703 \u2013 Making the Global Interpreter Lock Optional in CPython (2023)"},{"key":"788_CR12","unstructured":"NVIDIA: cuFFT (2023). https:\/\/developer.nvidia.com\/cuFFT"},{"key":"788_CR13","unstructured":"NVIDIA: cuBLAS (2023). https:\/\/developer.nvidia.com\/cublas"},{"issue":"5\u20136","key":"788_CR14","doi-asserted-by":"publisher","first-page":"232","DOI":"10.1016\/j.parco.2009.12.005","volume":"36","author":"S Tomov","year":"2010","unstructured":"Tomov, S., Dongarra, J., Baboulin, M.: Towards dense linear algebra for hybrid GPU accelerated manycore systems. Parallel Comput. 36(5\u20136), 232\u2013240 (2010). https:\/\/doi.org\/10.1016\/j.parco.2009.12.005","journal-title":"Parallel Comput."},{"key":"788_CR15","doi-asserted-by":"publisher","DOI":"10.1145\/2818311","author":"A Abdelfattah","year":"2016","unstructured":"Abdelfattah, A., Keyes, D., Ltaief, H.: Kblas: an optimized library for dense matrix-vector multiplication on gpu accelerators. ACM Trans. Math. Softw. (2016). https:\/\/doi.org\/10.1145\/2818311","journal-title":"ACM Trans. Math. Softw."},{"key":"788_CR16","unstructured":"Moritz, P., Nishihara, R., Wang, S., Tumanov, A., Liaw, R., Liang, E., Elibol, M., Yang, Z., Paul, W., Jordan, M.I., et al.: Ray: A distributed framework for emerging $$\\{$$AI$$\\}$$ applications. In: Operating Systems Design and Implementation, pp. 561\u2013577 (2018)"},{"issue":"1","key":"788_CR17","doi-asserted-by":"publisher","first-page":"66","DOI":"10.1177\/1094342015594678","volume":"31","author":"E Tejedor","year":"2017","unstructured":"Tejedor, E., Becerra, Y., Alomar, G., Queralt, A., Badia, R.M., Torres, J., Cortes, T., Labarta, J.: Pycompss: parallel computational workflows in python. Int. J. High Perf. Comput. Appl. 31(1), 66\u201382 (2017). https:\/\/doi.org\/10.1177\/1094342015594678","journal-title":"Int. J. High Perf. Comput. Appl."},{"key":"788_CR18","doi-asserted-by":"publisher","unstructured":"Lee, H., Ruys, W., Henriksen, I., Peters, A., Yan, Y., Stephens, S., You, B., Fingler, H., Burtscher, M., Gligoric, M., Schulz, K., Pingali, K., Rossbach, C.J., Erez, M., Biros, G.: Parla: A python orchestration system for heterogeneous architectures. In: SC22: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315 (2022). https:\/\/doi.org\/10.1109\/SC41404.2022.00056","DOI":"10.1109\/SC41404.2022.00056"},{"key":"788_CR19","doi-asserted-by":"crossref","unstructured":"Slaughter, E., Wu, W., Fu, Y., Brandenburg, L., Garcia, N., Kautz, W., Marx, E., Morris, K.S., Cao, Q., Bosilca, G., et al.: Task bench: A parameterized benchmark for evaluating parallel runtime performance. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315 (2020). IEEE","DOI":"10.1109\/SC41405.2020.00066"},{"key":"788_CR20","unstructured":"DeVito, Z.: Torchscript: Optimized execution of pytorch programs. Retrieved January (2022)"},{"key":"788_CR21","unstructured":"Snow, E.: PEP 703 \u2013 A Per-Interpreter GIL (2022)"},{"key":"788_CR22","doi-asserted-by":"crossref","unstructured":"Gonthier, M., Marchal, L., Thibault, S.: Memory-Aware Scheduling of Tasks Sharing Data on Multiple GPUs with Dynamic Runtime Systems, p. 1. IEEE (2022). https:\/\/hal.inria.fr\/hal-03552243 Accessed 2022-05-20","DOI":"10.1109\/IPDPS53621.2022.00073"},{"key":"788_CR23","unstructured":"StarPU Handbook - Language Bindings (2023)"},{"key":"788_CR24","unstructured":"Cloudpickle: Extended Pickling Support for Python Objects (2023)"},{"key":"788_CR25","unstructured":"Beazley, D.: Understanding the Python GIL. In: PyCON Python Conference. Atlanta, Georgia, pp. 1\u201362 (2010)"},{"key":"788_CR26","unstructured":"Rossum, N.C.: PEP 343 \u2013 The \"with\" Statement (2005)"},{"key":"788_CR27","doi-asserted-by":"crossref","unstructured":"Stanzione, D., West, J., Evans, R.T., Minyard, T., Ghattas, O., Panda, D.K.: Frontera: The evolution of leadership computing at the national science foundation. In: Practice and Experience in Advanced Research Computing, pp. 106\u2013111 (2020)","DOI":"10.1145\/3311790.3396656"},{"key":"788_CR28","unstructured":"Texas Advanced Computing Center (TACC), The University of Texas at Austin (2018). https:\/\/www.tacc.utexas.edu\/"},{"key":"788_CR29","doi-asserted-by":"publisher","unstructured":"Almgren-Bell, J., Awar, N.A., Geethakrishnan, D., Grigoric, M., Biros, G.: A Multi-GPU Python solver for low-temperature non-equilibrium plasmas. In: IEEE 34th International Symposium on Computer Architecture and High Performance Computing SBAC-PAD 2022, p. 11 (2022). https:\/\/doi.org\/10.1109\/SBAC-PAD55451.2022.00025","DOI":"10.1109\/SBAC-PAD55451.2022.00025"},{"key":"788_CR30","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lan, Z., Rich, P., Allcock, W.E., Papka, M.E., Austin, B., Paul, D.: Scheduling beyond CPUs for HPC. In: Proceedings of the 28th International Symposium on High-Performance Parallel and Distributed Computing, pp. 97\u2013108 (2019)","DOI":"10.1145\/3307681.3325401"},{"key":"788_CR31","unstructured":"Gog, I., Schwarzkopf, M., Gleave, A., Watson, R.N., Hand, S.: Firmament: Fast, centralized cluster scheduling at scale. In: 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16), pp. 99\u2013115 (2016)"},{"key":"788_CR32","doi-asserted-by":"crossref","first-page":"76","DOI":"10.1016\/j.jpdc.2017.06.009","volume":"111","author":"A Reuther","year":"2018","unstructured":"Reuther, A., Byun, C., Arcand, W., Bestor, D., Bergeron, B., Hubbell, M., Jones, M., Michaleas, P., Prout, A., Rosa, A., et al.: Scalable system scheduling for hpc and big data. J. Parallel Distrib. Comput. 111, 76\u201392 (2018)","journal-title":"J. Parallel Distrib. Comput."},{"key":"788_CR33","doi-asserted-by":"crossref","unstructured":"Schwiegeishohn, U., Yahyapour, R.: Improving first-come-first-serve job scheduling by gang scheduling. In: Workshop on Job Scheduling Strategies for Parallel Processing, pp. 180\u2013198 (1998). Springer","DOI":"10.1007\/BFb0053987"},{"key":"788_CR34","doi-asserted-by":"crossref","unstructured":"Han, J., Rafique, M.M., Xu, L., Butt, A.R., Lim, S.-H., Vazhkudai, S.S.: Marble: A multi-gpu aware job scheduler for deep learning on hpc systems. In: 2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID), pp. 272\u2013281 (2020)","DOI":"10.1109\/CCGrid49817.2020.00-66"},{"key":"788_CR35","doi-asserted-by":"crossref","unstructured":"Chen, C., Wang, S., Chen, Y., Han, J.: Tereis: A package-based scheduling in deep learning systems. In: 2022 IEEE 28th International Conference on Parallel and Distributed Systems (ICPADS), pp. 867\u2013874 (2023). IEEE","DOI":"10.1109\/ICPADS56603.2022.00117"},{"key":"788_CR36","unstructured":"Dask Development Team: Dask: Library for Dynamic Task Scheduling. (2016)"},{"key":"788_CR37","doi-asserted-by":"crossref","unstructured":"Babuji, Y., Woodard, A., Li, Z., Katz, D.S., Clifford, B., Kumar, R., Lacinski, L., Chard, R., Wozniak, J.M., Foster, I. and Wilde, M.: Parsl: Pervasive Parallel Programming in Python. HPDC \u201919 (2019)","DOI":"10.1145\/3332186.3332231"},{"key":"788_CR38","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00e9, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00e9gas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems (2015)"},{"key":"788_CR39","doi-asserted-by":"publisher","unstructured":"Chamberlain, B.L.: Chapel (Cray Inc. HPCS Language), pp. 249\u2013256 (2011). https:\/\/doi.org\/10.1007\/978-0-387-09766-4_54","DOI":"10.1007\/978-0-387-09766-4_54"},{"issue":"2","key":"788_CR40","doi-asserted-by":"crossref","first-page":"203","DOI":"10.1177\/1094342006064503","volume":"20","author":"J Nieplocha","year":"2006","unstructured":"Nieplocha, J., Palmer, B., Tipparaju, V., Krishnan, M., Trease, H., Apra, E.: Advances, applications and performance of the global arrays shared memory programming toolkit. Int. J. High Perf. Comput. Appl. 20(2), 203\u2013231 (2006)","journal-title":"Int. J. High Perf. Comput. Appl."},{"key":"788_CR41","doi-asserted-by":"crossref","unstructured":"Mishler, D., Ciesko, J., Olivier, S., Bosilca, G.: Performance Insights into Device-initiated RMA Using Kokkos Remote Spaces. In: 2023 IEEE International Conference on Cluster Computing Workshops (2023)","DOI":"10.1109\/CLUSTERWorkshops61457.2023.00028"},{"key":"788_CR42","doi-asserted-by":"crossref","unstructured":"Slaughter, E., Aiken, A.: Pygion: Flexible, scalable task-based parallelism with python. In: Parallel Applications Workshop, Alternatives To MPI, pp. 58\u201372 (2019)","DOI":"10.1109\/PAW-ATM49560.2019.00011"},{"key":"788_CR43","doi-asserted-by":"crossref","unstructured":"Bauer, M., Garland, M.: Legate NumPy: Accelerated and distributed array computing. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC19) (2019)","DOI":"10.1145\/3295500.3356175"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-025-00788-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10766-025-00788-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-025-00788-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T20:42:29Z","timestamp":1743885749000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10766-025-00788-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,18]]},"references-count":43,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["788"],"URL":"https:\/\/doi.org\/10.1007\/s10766-025-00788-1","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2025,3,18]]},"assertion":[{"value":"30 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"16"}}