{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T17:08:25Z","timestamp":1773248905333,"version":"3.50.1"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2021,4,12]],"date-time":"2021-04-12T00:00:00Z","timestamp":1618185600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,4,12]],"date-time":"2021-04-12T00:00:00Z","timestamp":1618185600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"the National Key R&D Program of China","award":["2016YFB0200902"],"award-info":[{"award-number":["2016YFB0200902"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2021,11]]},"DOI":"10.1007\/s11227-021-03759-8","type":"journal-article","created":{"date-parts":[[2021,4,12]],"date-time":"2021-04-12T11:07:38Z","timestamp":1618225658000},"page":"12647-12665","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Performance evaluation of convolutional neural network on Tianhe-3 prototype"],"prefix":"10.1007","volume":"77","author":[{"given":"Weiduo","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoshe","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Heng","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingda","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingjun","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,4,12]]},"reference":[{"key":"3759_CR1","unstructured":"Abadi M, Barham P, Chen J (2016) Tensorflow: A system for large-scale machine learning. In: 12th USENIX symposium on operating systems design and implementation, OSDI 2016, Savannah, GA, USA, November 2-4, 2016, USENIX Association, pp 265\u2013283"},{"key":"3759_CR2","doi-asserted-by":"crossref","unstructured":"Awan AA, Subramoni H, Panda DK (2017) An in-depth performance characterization of CPU- and gpu-based DNN training on modern architectures. In: Proceedings of the machine learning on HPC environments, MLHPC@SC 2017, Denver, CO, USA, November 13, 2017, ACM, pp 8:1\u20138:8","DOI":"10.1145\/3146347.3146356"},{"key":"3759_CR3","unstructured":"Chetlur S, Woolley C, Vandermersch P (2014) cudnn: Efficient primitives for deep learning. CoRR abs\/1410.0759, arXiv:1410.0759"},{"key":"3759_CR4","unstructured":"Chilimbi TM, Suzue Y, Apacible J (2014) Project adam: Building an efficient and scalable deep learning training system. In: 11th USENIX symposium on operating systems design and implementation, OSDI \u201914, Broomfield, CO, USA, October 6-8, 2014, USENIX Association, pp 571\u2013582"},{"key":"3759_CR5","unstructured":"Dean J, Corrado G, Monga R, (2012) Large scale distributed deep networks. In: Advances in neural information processing systems 25: 26th annual conference on neural information processing systems, (2012) Proceedings of a meeting held December 3\u20136, 2012. Lake Tahoe, Nevada, United States, pp 1232\u20131240"},{"key":"3759_CR6","unstructured":"Developer N (2018) Nvidia turing architecture whitepaper. Whitepaper, accessed April 26, 2020"},{"key":"3759_CR7","doi-asserted-by":"crossref","unstructured":"Fang J, Fu H, Zhao W (2017) swdnn: A library for accelerating deep learning applications on sunway taihulight. In: 2017 IEEE International parallel and distributed processing symposium, IPDPS 2017, Orlando, FL, USA, May 29 - June 2, 2017, IEEE Computer Society, pp 615\u2013624","DOI":"10.1109\/IPDPS.2017.20"},{"key":"3759_CR8","unstructured":"Gibiansky A (2016) Bringing hpc techniques to deep learning. Website, http:\/\/research.baidu.com\/bringing-hpc-techniques-deep-learning\/, accessed Mar 22, 2018"},{"key":"3759_CR9","doi-asserted-by":"crossref","unstructured":"Goto K, van de Geijn RA (2008) Anatomy of high-performance matrix multiplication. ACM Trans Math Softw 34(3):12:1-12:25","DOI":"10.1145\/1356052.1356053"},{"key":"3759_CR10","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"3759_CR11","unstructured":"Howard AG, Zhu M, Chen B (2017) Mobilenets: Efficient convolutional neural networks for mobile vision applications. CoRR abs\/1704.04861, arXiv:1704.04861"},{"key":"3759_CR12","unstructured":"Jack D (2017) Report on the tianhe-2a system. Technical report, https:\/\/www.icl.utk.edu\/files\/publications\/2017\/icl-utk-970-2017.pdf, accessed April 4, 2020"},{"key":"3759_CR13","doi-asserted-by":"crossref","unstructured":"Jang M, Kim K, Kim K (2011) The performance analysis of ARM NEON technology for mobile platforms. In: Research in applied computation symposium, RACS \u201911, Miami, FL, USA, October 19-22, 2011, ACM, pp 104\u2013106","DOI":"10.1145\/2103380.2103401"},{"key":"3759_CR14","unstructured":"JD M (1996) Stream benchmark. Website, http:\/\/www.cs.virginia.edu\/stream\/ref.html#what, accessed April 26, 2020"},{"key":"3759_CR15","unstructured":"Jia X, Song S, He W (2018) Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. CoRR abs\/1807.11205, arXiv:1807.11205"},{"key":"3759_CR16","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inform Process Syst 25:1097\u20131105","journal-title":"Adv Neural Inform Process Syst"},{"key":"3759_CR17","doi-asserted-by":"crossref","unstructured":"Lavin A, Gray S (2016) Fast algorithms for convolutional neural networks. In: 2016 IEEE conference on computer vision and pattern recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016, IEEE Computer Society, pp 4013\u20134021","DOI":"10.1109\/CVPR.2016.435"},{"issue":"6","key":"3759_CR18","doi-asserted-by":"publisher","first-page":"939","DOI":"10.1631\/FITEE.1900075","volume":"21","author":"Y Li","year":"2020","unstructured":"Li Y, Chen X, Liu J (2020) OHTMA: an optimized heuristic topology-aware mapping algorithm on the tianhe-3 exascale supercomputer prototype. Front Inf Technol Electron Eng 21(6):939\u2013949","journal-title":"Front Inf Technol Electron Eng"},{"key":"3759_CR19","unstructured":"Lian X, Zhang C, Zhang H, (2017) Can decentralized algorithms outperform centralized algorithms? A case study for decentralized parallel stochastic gradient descent. Advances in neural information processing systems 30: annual conference on neural information processing systems, (2017) 4\u20139 December 2017. Long Beach, CA, USA, pp 5330\u20135340"},{"key":"3759_CR20","doi-asserted-by":"crossref","unstructured":"McIntosh-Smith S, Price J, Deakin T (2019) A performance analysis of the first generation of hpc-optimized arm processors. Concurr Comput Pract Exp 31(16)","DOI":"10.1002\/cpe.5110"},{"key":"3759_CR21","unstructured":"Molchanov P, Tyree S, Karras T (2017) Pruning convolutional neural networks for resource efficient inference. In: 5th International conference on learning representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings, OpenReview.net"},{"key":"3759_CR22","doi-asserted-by":"publisher","first-page":"322","DOI":"10.1016\/j.future.2013.07.013","volume":"36","author":"N Rajovic","year":"2014","unstructured":"Rajovic N, Rico A, Puzovic N (2014) Tibidabo: making the case for an arm-based HPC system. Fut Gener Comput Syst 36:322\u2013334","journal-title":"Fut Gener Comput Syst"},{"key":"3759_CR23","unstructured":"Research B (2019) Deepbench. Website, https:\/\/github.com\/baidu-research\/DeepBench, accessed April 26, 2020"},{"key":"3759_CR24","unstructured":"Shazeer N, Mirhoseini A, Maziarz K (2017) Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. In: 5th International conference on learning representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings, OpenReview.net"},{"key":"3759_CR25","unstructured":"Sun D, Liu S, Gaudiot J (2017) Enabling embedded inference engine with ARM compute library: a case study. CoRR abs\/1704.03751, arXiv:1704.03751"},{"key":"3759_CR26","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y (2015) Going deeper with convolutions. In: IEEE conference on computer vision and pattern recognition, CVPR 2015, Boston, MA, USA, June 7-12, 2015, IEEE Computer Society, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"3759_CR27","doi-asserted-by":"crossref","unstructured":"Watcharapichat P, Morales VL, Fernandez RC (2016) Ako: Decentralised deep learning with partial gradient exchange. In: Proceedings of the seventh ACM symposium on cloud computing, Santa Clara, CA, USA, October 5-7, 2016, ACM, pp 84\u201397","DOI":"10.1145\/2987550.2987586"},{"issue":"4","key":"3759_CR28","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1145\/1498765.1498785","volume":"52","author":"S Williams","year":"2009","unstructured":"Williams S, Waterman A, Patterson DA (2009) Roofline: an insightful visual performance model for multicore architectures. Commun ACM 52(4):65\u201376","journal-title":"Commun ACM"},{"issue":"10","key":"3759_CR29","doi-asserted-by":"publisher","first-page":"7003","DOI":"10.1007\/s11227-019-02911-9","volume":"75","author":"D Yokoyama","year":"2019","unstructured":"Yokoyama D, Schulze B, Borges F (2019) The survey on ARM processors for HPC. J Supercomput 75(10):7003\u20137036","journal-title":"J Supercomput"},{"key":"3759_CR30","doi-asserted-by":"crossref","unstructured":"You X, Yang H, and ZL (2019) Performance evaluation and analysis of linear algebra kernels in the prototype tianhe-3 cluster. In: Supercomputing Frontiers - 5th Asian Conference, SCFA 2019, Singapore, March 11-14, 2019, Proceedings, Springer, Lecture Notes in Computer Science, vol 11416, pp 86\u2013105","DOI":"10.1007\/978-3-030-18645-6_6"},{"key":"3759_CR31","unstructured":"Zhang X, Wang Q, W S (2020) Openblas: an optimized blas library. Website, http:\/\/www.openblas.net\/, accessed April 25, 2020"},{"issue":"12","key":"3759_CR32","doi-asserted-by":"publisher","first-page":"2094","DOI":"10.14778\/3352063.3352127","volume":"12","author":"R Zhu","year":"2019","unstructured":"Zhu R, Zhao K, Yang H (2019) Aligraph: a comprehensive graph neural network platform. Proc VLDB Endow 12(12):2094\u20132105","journal-title":"Proc VLDB Endow"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-03759-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-021-03759-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-03759-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,24]],"date-time":"2022-12-24T02:24:08Z","timestamp":1671848648000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-021-03759-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,4,12]]},"references-count":32,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2021,11]]}},"alternative-id":["3759"],"URL":"https:\/\/doi.org\/10.1007\/s11227-021-03759-8","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,4,12]]},"assertion":[{"value":"18 March 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 April 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}