{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:19:28Z","timestamp":1757618368011,"version":"3.44.0"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"\u201cFutureHPC & BigData\u201d of the ICSC - Centro Nazionale di Ricerca in \u201cHigh Performance Computing, Big Data and Quantum Computing"},{"name":"DYMAN","award":["101161930"],"award-info":[{"award-number":["101161930"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s10766-025-00802-6","type":"journal-article","created":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T12:39:41Z","timestamp":1751287181000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Analysis of Model Parallelism for AI Applications on a 64-core RV64 Server CPU"],"prefix":"10.1007","volume":"53","author":[{"given":"Giulio","family":"Malenza","sequence":"first","affiliation":[]},{"given":"Adriano Marques","family":"Garcia","sequence":"additional","affiliation":[]},{"given":"Robert","family":"Birke","sequence":"additional","affiliation":[]},{"given":"Luca","family":"Benini","sequence":"additional","affiliation":[]},{"given":"Marco","family":"Aldinucci","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"802_CR1","doi-asserted-by":"publisher","DOI":"10.3390\/info14020064","author":"S Kalapothas","year":"2023","unstructured":"Kalapothas, S., Galetakis, M., Flamis, G., Plessas, F., Kitsos, P.: A survey on RISC-v-based machine learning ecosystem. Information (2023). https:\/\/doi.org\/10.3390\/info14020064","journal-title":"Information"},{"key":"802_CR2","unstructured":"Yuwei, L.: First 64-bit Multi-core RISC-V CPU Server Cluster Launched. https:\/\/www.eet-china.com\/news\/202308282478.html. Accessed 1ST Oct 2024 (2023)"},{"key":"802_CR3","unstructured":"SiFive: SiFive Announces New High?performance RISC?V Datacenter Processor for Demanding AI Workloads. https:\/\/www.sifive.com\/press\/sifive-announces-high-performance-risc-v-datacenter-processor-for-ai-workloads. Accessed 30 Sept 2024 (2024)"},{"key":"802_CR4","doi-asserted-by":"publisher","unstructured":"Gholami, A., Yao, Z., Kim, S., Hooper, C., Mahoney, M.W., Keutzer, K.: Ai and memory wall. IEEE Micro, pp. 1\u20135 (2024) https:\/\/doi.org\/10.1109\/MM.2024.3373763","DOI":"10.1109\/MM.2024.3373763"},{"key":"802_CR5","doi-asserted-by":"publisher","unstructured":"Chen, C., Xiang, X., Liu, C., Shang, Y., Guo, R., Liu, D., Lu, Y., Hao, Z., Luo, J., Chen, Z., Li, C., Pu, Y., Meng, J., Yan, X., Xie, Y., Qi, X.: Xuantie-910: a commercial multi-core 12-stage pipeline out-of-order 64-bit high performance risc-v processor with vector extension. In: Proceedings of the ACM\/IEEE 47th Annual International Symposium on Computer Architecture. ISCA \u201920, pp. 52\u201364. IEEE Press, Virtual Event (2020). https:\/\/doi.org\/10.1109\/ISCA45697.2020.00016","DOI":"10.1109\/ISCA45697.2020.00016"},{"key":"802_CR6","doi-asserted-by":"crossref","unstructured":"Waterman, A., Lee, Y., Patterson, D., Asanovic, K., Isa, V.I.U., Waterman, A., Lee, Y., Patterson, D.: The RISC-v instruction set manual. Volume I: User-Level ISA\u2019, version 2, pp. 1\u201379 (2014)","DOI":"10.21236\/ADA605735"},{"key":"802_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"802_CR8","doi-asserted-by":"publisher","unstructured":"Aldinucci, M., Danelutto, M., Kilpatrick, P., Torquati, M.: Fastflow: high-level and efficient streaming on multicore, (2014). https:\/\/doi.org\/10.1002\/9781119332015.ch13","DOI":"10.1002\/9781119332015.ch13"},{"issue":"1","key":"802_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10766-022-00750-5","volume":"51","author":"N Tonci","year":"2022","unstructured":"Tonci, N., Torquati, M., Mencagli, G., Danelutto, M.: Distributed-memory fastflow building blocks. Int. J. Parallel Program. 51(1), 1\u201321 (2022). https:\/\/doi.org\/10.1007\/s10766-022-00750-5","journal-title":"Int. J. Parallel Program."},{"key":"802_CR10","unstructured":"ONNX: Open Neural Network Exchange (ONNX). https:\/\/github.com\/onnx\/onnx"},{"key":"802_CR11","unstructured":"Aldinucci, M., Danelutto, M.: Stream parallel skeleton optimization (2024). https:\/\/arxiv.org\/abs\/2408.12394"},{"key":"802_CR12","unstructured":"Cole, M.: Algorithmic skeletons: Structured management of parallel computation. (1989). https:\/\/api.semanticscholar.org\/CorpusID:13901043"},{"issue":"1","key":"802_CR13","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1016\/0167-739X(92)90040-I","volume":"8","author":"M Danelutto","year":"1992","unstructured":"Danelutto, M., Di Meglio, R., Orlando, S., Pelagatti, S., Vanneschi, M.: A methodology for the development and the support of massively parallel programs. Futur. Gener. Comput. Syst. 8(1), 205\u2013220 (1992). https:\/\/doi.org\/10.1016\/0167-739X(92)90040-I","journal-title":"Futur. Gener. Comput. Syst."},{"key":"802_CR14","doi-asserted-by":"publisher","unstructured":"Garcia, A.M., Griebler, D., Schepke, C., Garc\u00eda, J.D., Mu\u00f1oz, J.F., Fernandes, L.G.: Performance and programmability of GRPPI for parallel stream processing on multi-cores. J. Supercomput. (In press), pp. 1\u201335 (2024) https:\/\/doi.org\/10.1007\/s11227-024-05934-z","DOI":"10.1007\/s11227-024-05934-z"},{"issue":"24","key":"802_CR15","doi-asserted-by":"publisher","first-page":"4175","DOI":"10.1002\/cpe.4175","volume":"29","author":"D del Rio Astorga","year":"2017","unstructured":"del Rio Astorga, D., Dolz, M.F., Fern\u00e1ndez, J., Garc\u00eda, J.D.: A generic parallel pattern interface for stream and data processing. Concurr. Comput. Pract. Exp. 29(24), 4175 (2017). https:\/\/doi.org\/10.1002\/cpe.4175","journal-title":"Concurr. Comput. Pract. Exp."},{"key":"802_CR16","doi-asserted-by":"crossref","unstructured":"Voss, M., Asenjo, R., Reinders, J.: Pro TBB: C++ Parallel Programming with Threading Building Blocks, 1st edn. Apress, USA (2019)","DOI":"10.1007\/978-1-4842-4398-5"},{"issue":"8","key":"802_CR17","doi-asserted-by":"publisher","first-page":"9206","DOI":"10.1007\/s11227-022-05024-y","volume":"79","author":"AM Garcia","year":"2023","unstructured":"Garcia, A.M., Griebler, D., Schepke, C., Fernandes, L.G.: Micro-batch and data frequency for stream processing on multi-cores. J. Supercomput. 79(8), 9206\u20139244 (2023). https:\/\/doi.org\/10.1007\/s11227-022-05024-y","journal-title":"J. Supercomput."},{"key":"802_CR18","unstructured":"PyTorch Documentation: CPU Threading and TorchScript inference. https:\/\/pytorch.org\/docs\/stable\/notes\/cpu_threading_torchscript_inference.html. Accessed 30 Sept 2024 (2024)"},{"key":"802_CR19","doi-asserted-by":"publisher","unstructured":"Wang, Q., Zhang, X., Zhang, Y., Yi, Q.: Augem: Automatically generate high performance dense linear algebra kernels on x86 cpus. In: SC \u201913: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2013). https:\/\/doi.org\/10.1145\/2503210.2503219","DOI":"10.1145\/2503210.2503219"},{"key":"802_CR20","unstructured":"Guennebaud, G., Jacob, B., et al.: Eigen v3. http:\/\/eigen.tuxfamily.org (2010)"},{"key":"802_CR21","unstructured":"Intel Corporation: oneAPI Deep Neural Network Library. https:\/\/github.com\/oneapi-src\/oneDNN"},{"key":"802_CR22","unstructured":"Iandola, F.N., Han, S., Moskewicz, M.W., Ashraf, K., Dally, W.J., Keutzer, K.: Squeezenet: Alexnet-level accuracy with 50x fewer parameters and $$<$$0.5mb model size. arXiv:1602.07360 (2016)"},{"key":"802_CR23","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2015)"},{"issue":"6","key":"802_CR24","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Commun. ACM 60(6), 84\u201390 (2017). https:\/\/doi.org\/10.1145\/3065386","journal-title":"Commun. ACM"},{"key":"802_CR25","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks (2013). https:\/\/arxiv.org\/abs\/1311.2901"},{"issue":"2","key":"802_CR26","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1109\/72.279181","volume":"5","author":"Y Bengio","year":"1994","unstructured":"Bengio, Y., Simard, P., Frasconi, P.: Learning long-term dependencies with gradient descent is difficult. IEEE Trans. Neural Networks 5(2), 157\u2013166 (1994). https:\/\/doi.org\/10.1109\/72.279181","journal-title":"IEEE Trans. Neural Networks"},{"key":"802_CR27","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: International Conference on Artificial Intelligence and Statistics (2010). https:\/\/api.semanticscholar.org\/CorpusID:5575601"},{"key":"802_CR28","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). IEEE","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"802_CR29","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., Desmaison, A., Kopf, A., Yang, E., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., Chintala, S.: Pytorch: An imperative style, high-performance deep learning library. In: Wallach, H., Larochelle, H., Beygelzimer, A., Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 32, pp. 1\u201312. Curran Associates, Inc., Vancouver, Canada (2019). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf"},{"key":"802_CR30","unstructured":"Colonnelli, I., Birke, R., Aldinucci, M.: Experimenting with pytorch on risc-v. In: RISC-V Summit Europe 2023, Barcelona, Spain (2023). Poster. https:\/\/iris.unito.it\/retrieve\/429bf344-9090-42c3-809c-1b8ac320a930\/2023-06-08-Iacopo-COLONNELLI-abstract.pdf"},{"key":"802_CR31","unstructured":"Free Software Foundation, Inc.: GCC, the GNU Compiler Collection. https:\/\/gcc.gnu.org"},{"issue":"3","key":"802_CR32","first-page":"14","volume":"41","author":"FG Zee","year":"2015","unstructured":"Zee, F.G., Geijn, R.A.: BLIS: A framework for rapidly instantiating BLAS functionality. ACM Trans. Math. Softw. 41(3), 14\u201311433 (2015)","journal-title":"ACM Trans. Math. Softw."},{"key":"802_CR33","unstructured":"Grant, W.S., Voorhies, R.: Cereal: a C++11 library for serialization. https:\/\/github.com\/USCiLab\/cereal (2013)"},{"key":"802_CR34","doi-asserted-by":"publisher","unstructured":"Mittone, G., Tonci, N., Birke, R., Colonnelli, I., Medi\u0107, D., Bartolini, A., Esposito, R., Parisi, E., Beneventi, F., Polato, M., Torquati, M., Benini, L., Aldinucci, M.: Experimenting with emerging RISC-V systems for decentralised machine learning. In: 20th ACM International Conference on Computing Frontiers (CF \u201923). ACM, Bologna, Italy (2023). https:\/\/doi.org\/10.1145\/3587135.3592211","DOI":"10.1145\/3587135.3592211"},{"key":"802_CR35","doi-asserted-by":"publisher","unstructured":"Ficarelli, F., Bartolini, A., Parisi, E., Beneventi, F., Barchi, F., Gregori, D., Magugliani, F., Cicala, M., Gianfreda, C., Cesarini, D., Acquaviva, A., Benini, L.: Meet monte cimone: exploring risc-v high performance compute clusters. CF \u201922, pp. 207\u2013208. Association for Computing Machinery, New York, NY, USA (2022). https:\/\/doi.org\/10.1145\/3528416.3530869","DOI":"10.1145\/3528416.3530869"},{"key":"802_CR36","doi-asserted-by":"publisher","unstructured":"Aldinucci, M., Rabellino, S., Pironti, M., Spiga, F., Viviani, P., Drocco, M., Guerzoni, M., Boella, G., Mellia, M., Margara, P., Drago, I., Marturano, R., Marchetto, G., Piccolo, E., Bagnasco, S., Lusso, S., Vallero, S., Attardi, G., Barchiesi, A., Colla, A., Galeazzi, F.: Hpc4ai: an ai-on-demand federated platform endeavour. CF \u201918, pp. 279\u2013286. Association for Computing Machinery, New York, NY, USA (2018). https:\/\/doi.org\/10.1145\/3203217.3205340","DOI":"10.1145\/3203217.3205340"},{"key":"802_CR37","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1007\/978-3-031-40843-4_31","volume-title":"High Performance Computing","author":"JKL Lee","year":"2023","unstructured":"Lee, J.K.L., Jamieson, M., Brown, N., Jesus, R.: Test-driving RISC-v vector hardware for HPC. In: Bienz, A., Weiland, M., Baboulin, M., Kruse, C. (eds.) High Performance Computing, pp. 419\u2013432. Springer, Cham (2023)"},{"key":"802_CR38","doi-asserted-by":"publisher","unstructured":"Brown, N., Jamieson, M., Lee, J., Wang, P.: Is RISC-V ready for HPC prime-time: Evaluating the 64-core Sophon SG2042 RISC-V CPU. In: Proceedings of the SC \u201923 Workshops of The International Conference on High Performance Computing, Network, Storage, and Analysis. SC-W \u201923, pp. 1566\u20131574. Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3624062.3624234","DOI":"10.1145\/3624062.3624234"},{"key":"802_CR39","unstructured":"Lepers, B., Qu\u00e9ma, V., Fedorova, A.: Thread and memory placement on NUMA systems: asymmetry matters. In: Proceedings of the 2015 USENIX Conference on USENIX Annual Technical Conference. USENIX ATC \u201915, pp. 277\u2013289. USENIX Association, USA (2015)"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-025-00802-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10766-025-00802-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-025-00802-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,7]],"date-time":"2025-09-07T00:11:02Z","timestamp":1757203862000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10766-025-00802-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":39,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["802"],"URL":"https:\/\/doi.org\/10.1007\/s10766-025-00802-6","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"7 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"27"}}