{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T08:00:45Z","timestamp":1772870445759,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,27]],"date-time":"2023-02-27T00:00:00Z","timestamp":1677456000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1927880"],"award-info":[{"award-number":["1927880"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,27]]},"DOI":"10.1145\/3581576.3581615","type":"proceedings-article","created":{"date-parts":[[2023,2,3]],"date-time":"2023-02-03T17:06:17Z","timestamp":1675443977000},"page":"24-34","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Performance Study on CPU-based Machine Learning with PyTorch"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5540-2731","authenticated-orcid":false,"given":"Smeet","family":"Chheda","sequence":"first","affiliation":[{"name":"Institute for Advanced Computational Science, Stony Brook University, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8192-0700","authenticated-orcid":false,"given":"Anthony","family":"Curtis","sequence":"additional","affiliation":[{"name":"Institute for Advanced Computational Science, Stony Brook University, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1216-1576","authenticated-orcid":false,"given":"Eva","family":"Siegmann","sequence":"additional","affiliation":[{"name":"Institute for Advanced Computational Science, Stony Brook University, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8449-8579","authenticated-orcid":false,"given":"Barbara","family":"Chapman","sequence":"additional","affiliation":[{"name":"Institute for Advanced Computational Science, Stony Brook University, United States"}]}],"member":"320","published-online":{"date-parts":[[2023,2,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Imagenet Large Scale Visual Recognition Challenge","year":"2015","unstructured":"2015. Imagenet Large Scale Visual Recognition Challenge 2015. https:\/\/image-net.org\/challenges\/LSVRC\/2015\/results"},{"key":"e_1_3_2_1_2_1","unstructured":"2019. AdaSum with Horovod. https:\/\/horovod.readthedocs.io\/en\/stable\/adasum_user_guide_include.html"},{"key":"e_1_3_2_1_3_1","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Martin","year":"2016","unstructured":"Martin Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek\u00a0G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 265\u2013283. https:\/\/www.usenix.org\/system\/files\/conference\/osdi16\/osdi16-abadi.pdf"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3146347.3146356"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/Cluster48925.2021.00106"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3084795"},{"key":"e_1_3_2_1_7_1","volume-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274(2015).","author":"Chen Tianqi","year":"2015","unstructured":"Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. 2015. Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274(2015)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/SBAC-PAD55451.2022.00027"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC54614.2021.00009"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503470.3503478"},{"key":"e_1_3_2_1_12_1","unstructured":"Fujitsu Global. 2019. Fujitsu Processor A64FX. www.fujitsu.com\/global\/products\/computing\/servers\/supercomputer\/a64fx"},{"key":"e_1_3_2_1_13_1","unstructured":"Fujitsu Global. 2021. Fujitsu PyTorch fork. https:\/\/github.com\/fujitsu\/pytorch"},{"key":"e_1_3_2_1_14_1","unstructured":"Priya Goyal Piotr Doll\u00e1r Ross Girshick Pieter Noordhuis Lukasz Wesolowski Aapo Kyrola Andrew Tulloch Yangqing Jia and Kaiming He. 2017. Accurate large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677(2017)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Philipp Grete Joshua\u00a0C Dolence Jonah\u00a0M Miller Joshua Brown Ben Ryan Andrew Gaspar Forrest Glines Sriram Swaminarayan Jonas Lippuner Clell\u00a0J Solomon 2022. Parthenon\u2013a performance portable block-structured adaptive mesh refinement framework. arXiv preprint arXiv:2202.12309(2022).","DOI":"10.1177\/10943420221143775"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","unstructured":"hpl-ai.org. 2022. HPL AI Mixed Precision Benchmark. https:\/\/hpl-ai.org\/doc\/results\/"},{"key":"e_1_3_2_1_18_1","unstructured":"https:\/\/portal.tacc.utexas.edu\/. 2017. Stampede2 User Guide. https:\/\/portal.tacc.utexas.edu\/user-guides\/stampede2"},{"key":"e_1_3_2_1_19_1","unstructured":"https:\/\/www.oneapi.io\/open-source\/.2016. oneAPI Deep Neural Network Library. https:\/\/github.com\/oneapi-src\/oneDNN"},{"key":"e_1_3_2_1_20_1","unstructured":"IACS. 2020. Ookami Homepage. https:\/\/www.stonybrook.edu\/commcms\/ookami\/"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1587\/transele.2021LHP0001"},{"key":"e_1_3_2_1_22_1","unstructured":"Alex Krizhevsky Geoffrey Hinton 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_1_23_1","volume-title":"Advances in Neural Information Processing Systems, F.\u00a0Pereira, C.J. Burges, L.\u00a0Bottou, and K","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey\u00a0E Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems, F.\u00a0Pereira, C.J. Burges, L.\u00a0Bottou, and K.Q. Weinberger (Eds.). Vol.\u00a025. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper\/2012\/file\/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf"},{"key":"e_1_3_2_1_24_1","unstructured":"Nouamane Laanait Joshua Romero Junqi Yin M\u00a0Todd Young Sean Treichler Vitalii Starchenko Albina Borisevich Alex Sergeev and Michael Matheson. 2019. Exascale deep learning for scientific inverse problems. arXiv preprint arXiv:1909.11150(2019)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.591"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00068"},{"key":"e_1_3_2_1_28_1","volume-title":"Comparing OpenMP Implementations with Applications Across A64FX Platforms","author":"Michalowicz Benjamin","unstructured":"Benjamin Michalowicz, Eric Raut, Yan Kang, Tony Curtis, Barbara Chapman, and Dossay Oryspayev. 2021. Comparing OpenMP Implementations with Applications Across A64FX Platforms. In OpenMP: Enabling Massive Node-Level Parallelism, Simon McIntosh-Smith, Bronis\u00a0R. de\u00a0Supinski, and Jannis Klinkenberg (Eds.). Springer International Publishing, Cham, 127\u2013141."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Benjamin Michalowicz Eric Raut Yan Kang Tony Curtis Barbara Chapman and Dossay Oryspayev. 2021. Comparing the behavior of OpenMP Implementations with various Applications on two different Fujitsu A64FX platforms. In Practice and Experience in Advanced Research Computing. 1\u20134.","DOI":"10.1145\/3437359.3465592"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2019.101635"},{"key":"e_1_3_2_1_31_1","unstructured":"MLCommons. 2020. ML Commons. https:\/\/mlcommons.org\/en\/training-hpc-07\/"},{"key":"e_1_3_2_1_32_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.). Curran Associates, Inc., 8024\u20138035. http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"e_1_3_2_1_33_1","unstructured":"RIKEN. 2021. Fugaku Supercomputer. https:\/\/www.r-ccs.riken.jp\/en\/fugaku"},{"key":"e_1_3_2_1_34_1","unstructured":"Alexander Sergeev and Mike\u00a0Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799(2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC53243.2021.00029"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330756"},{"key":"e_1_3_2_1_37_1","unstructured":"TOP500.org. 2020. HPCG June 2022. https:\/\/www.top500.org\/lists\/hpcg\/2022\/06\/"},{"key":"e_1_3_2_1_38_1","unstructured":"TOP500.org. 2020. Top500 November 2021. https:\/\/www.top500.org\/lists\/top500\/2021\/11\/"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2017.160"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Wide residual networks. arXiv preprint arXiv:1605.07146(2016).","DOI":"10.5244\/C.30.87"}],"event":{"name":"HPCAsia2023 Workshop: International Conference on High Performance Computing in Asia-Pacific Region Workshops","location":"Raffles Blvd Singapore","acronym":"HPCAsia2023 Workshop"},"container-title":["Proceedings of the HPC Asia 2023 Workshops"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581576.3581615","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581576.3581615","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581576.3581615","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:36:20Z","timestamp":1750178180000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581576.3581615"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,27]]},"references-count":41,"alternative-id":["10.1145\/3581576.3581615","10.1145\/3581576"],"URL":"https:\/\/doi.org\/10.1145\/3581576.3581615","relation":{},"subject":[],"published":{"date-parts":[[2023,2,27]]},"assertion":[{"value":"2023-02-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}