{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T17:59:25Z","timestamp":1771955965214,"version":"3.50.1"},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T00:00:00Z","timestamp":1630454400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2021,9]]},"DOI":"10.1007\/s42514-021-00072-x","type":"journal-article","created":{"date-parts":[[2021,9,28]],"date-time":"2021-09-28T09:03:30Z","timestamp":1632819810000},"page":"224-241","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Establishing high performance AI ecosystem on Sunway platform"],"prefix":"10.1007","volume":"3","author":[{"given":"Sha","family":"Liu","sequence":"first","affiliation":[]},{"given":"Jie","family":"Gao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7870-6535","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zeqiang","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Tianyu","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,28]]},"reference":[{"key":"72_CR1","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Zheng, X.: Tensorflow: Large-scale machine learning on heterogeneous distributed systems (2015)"},{"key":"72_CR2","first-page":"120","volume":"25","author":"G Bradski","year":"2000","unstructured":"Bradski, G.: The opencv library. Dr Dobbs J. Softw. Tools 25, 120\u2013125 (2000)","journal-title":"Dr Dobbs J. Softw. Tools"},{"key":"72_CR3","unstructured":"Brown, T.B., Mann, B., Ryder, N., Subbiah, M., Amodei, D.: Language models are few-shot learners (2020)"},{"key":"72_CR4","doi-asserted-by":"publisher","unstructured":"Chen, C., Peng, X., Xing, Z., Sun, J., Wang, X., Zhao, Y., Zhao, W.: IEEE Trans. Softw. Eng. (2020). https:\/\/doi.org\/10.1109\/TSE.2021.3074309","DOI":"10.1109\/TSE.2021.3074309"},{"key":"72_CR5","unstructured":"Chetlur, S., Woolley, C., Vandermersch, P., Cohen, J., Shelhamer, E.: Cudnn: Efficient primitives for deep learning. Comput. (2014)"},{"key":"72_CR6","unstructured":"Corporation, N.: Cublas library (2008)"},{"key":"72_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding (2018)"},{"key":"72_CR8","doi-asserted-by":"crossref","unstructured":"Fang, J., Fu, H., Zhao, W., Chen, B., Yang, G.: Swdnn: a library for accelerating deep learning applications on sunway taihulight. In: 2017 IEEE international parallel and distributed processing symposium (IPDPS) (2017)","DOI":"10.1109\/IPDPS.2017.20"},{"key":"72_CR9","unstructured":"Fang, J., Li, L., Fu, H., Jiang, J., Zhao, W., He, C., You, X., Yang, G.: swcaffe: a parallel framework for accelerating deep learning applications on sunway taihulight (2019)"},{"key":"72_CR10","unstructured":"Fedus, W., Zoph, B., Shazeer, N.: Switch transformers: scaling to trillion parameter models with simple and efficient sparsity (2021)"},{"key":"72_CR11","unstructured":"Forum, M.P.: MPI: a message-passing interface standard. MPI: A Message-Passing Interface Standard (1994)"},{"issue":"7","key":"72_CR12","first-page":"113","volume":"59","author":"H Fu","year":"2016","unstructured":"Fu, H., Liao, J., Yang, J., Wang, L., Song, Z., Huang, X., Yang, C., Xue, W., Liu, F., Qiao, F., Zhao, W.: The sunway taihu light supercomputer: system and applications. Sci. China (Inf. Sci.) 59(7), 113\u2013128 (2016)","journal-title":"Sci. China (Inf. Sci.)"},{"key":"72_CR13","doi-asserted-by":"crossref","unstructured":"Gao, J., Zhou, J., Zhou, C., Yu, J.X.: Glog: a high level graph analysis system using mapreduce. In: IEEE, pp. 544\u2013555 (2014)","DOI":"10.1109\/ICDE.2014.6816680"},{"key":"72_CR14","unstructured":"Gaskill, B.: Onnx: the open neural network exchange format. Linux J (2018)"},{"key":"72_CR15","unstructured":"Hak, M.: Gad-el: flow control : passive, active, and reactive flow management (2000)"},{"key":"72_CR16","unstructured":"Huang, Y., Cheng, Y., Chen, D., Lee, H., Ngiam, J., Le, Q.V., Chen, Z.: Gpipe: Efficient training of giant neural networks using pipeline parallelism. arXiv:1811.06965 (2019)"},{"key":"72_CR17","unstructured":"Intel.: Mkl-dnn for scalable deep learning (2017)"},{"key":"72_CR18","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Darrell, T.: Caffe: convolutional architecture for fast feature embedding. ACM (2014)","DOI":"10.1145\/2647868.2654889"},{"issue":"7553","key":"72_CR19","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y Lecun","year":"2015","unstructured":"Lecun, Y., Bengio, Y., Hinton, G.: Deep learning. Nature 521(7553), 436 (2015)","journal-title":"Nature"},{"key":"72_CR20","unstructured":"Lepikhin, D., Lee, H.J., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., Chen, Z.: Gshard: scaling giant models with conditional computation and automatic sharding (2020)"},{"key":"72_CR21","unstructured":"Luitjens, J.: Cuda streams: best practices and common pitfalls (2014)"},{"key":"72_CR22","unstructured":"Micikevicius, P., Narang, S., Alben, J., Diamos, G., Elsen, E., Garcia, D., Ginsburg, B., Houston, M., Kuchaiev, O., Venkatesh, G.: Mixed precision training (2017)"},{"key":"72_CR23","doi-asserted-by":"crossref","unstructured":"Myers, J.L., Well, A.D., Lorch, R.: Research design and statistical analysis. Res. Des. Stat. Anal. (2013)","DOI":"10.4324\/9780203726631"},{"issue":"3","key":"72_CR24","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1109\/MCSE.2007.58","volume":"9","author":"TE Oliphant","year":"2007","unstructured":"Oliphant, T.E.: Python for scientific computing. Comput. Sci. Eng. 9(3), 10\u201320 (2007)","journal-title":"Comput. Sci. Eng."},{"key":"72_CR25","unstructured":"Oliphant, T.E.: Guide to NumPy. Guide to NumPy (2015)"},{"key":"72_CR26","unstructured":"Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., Devito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017)"},{"key":"72_CR27","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Chintala, S.: Pytorch: an imperative style, high-performance deep learning library (2019)"},{"key":"72_CR28","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training (2018)"},{"key":"72_CR29","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019)"},{"key":"72_CR30","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., He, Y.: Zero: Memory optimization towards training a trillion parameter models (2019)","DOI":"10.1109\/SC41405.2020.00024"},{"key":"72_CR31","unstructured":"Sergeev, A., Balso, M.D.: Horovod: fast and easy distributed deep learning in tensorflow (2018)"},{"key":"72_CR32","unstructured":"Shoeybi, M., Patwary, M., Puri, R., Legresley, P., Catanzaro, B.: Megatron-lm: training multi-billion parameter language models using gpu model parallelism (2019)"},{"key":"72_CR33","unstructured":"Whitehead, M.: Creating fast and accurate machine learning ensembles through training dataset preprocessing. Ph.D. thesis, Indiana University (2010)"},{"key":"72_CR34","unstructured":"Zhang, H., Cheng, X., Zang, H., Park, D.H.: Compiler-level matrix multiplication optimization for deep learning (2019)"},{"key":"72_CR35","unstructured":"Zhao, R., Vogel, B., Ahmed, T.: Adaptive loss scaling for mixed precision training (2019)"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-021-00072-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-021-00072-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-021-00072-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,20]],"date-time":"2021-10-20T21:07:47Z","timestamp":1634764067000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-021-00072-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9]]},"references-count":35,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2021,9]]}},"alternative-id":["72"],"URL":"https:\/\/doi.org\/10.1007\/s42514-021-00072-x","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,9]]},"assertion":[{"value":"16 April 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 July 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 September 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}