{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T20:16:59Z","timestamp":1767212219079,"version":"3.37.3"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T00:00:00Z","timestamp":1632096000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T00:00:00Z","timestamp":1632096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100013076","name":"National Major Science and Technology Projects of China","doi-asserted-by":"crossref","award":["2018ZX01028-102"],"award-info":[{"award-number":["2018ZX01028-102"]}],"id":[{"id":"10.13039\/501100013076","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2022,5]]},"DOI":"10.1007\/s10489-021-02794-5","type":"journal-article","created":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T16:31:03Z","timestamp":1632155463000},"page":"7054-7069","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Evaluating performance of AI operators using roofline model"],"prefix":"10.1007","volume":"52","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1208-0760","authenticated-orcid":false,"given":"Zhengbo","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fang","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rujun","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feng","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zuoning","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,9,20]]},"reference":[{"key":"2794_CR1","first-page":"01","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton G (2012) Imagenet classification with deep convolutional neural networks. Neural Inf Process Syst 25:01","journal-title":"Neural Inf Process Syst"},{"key":"2794_CR2","first-page":"2493","volume":"12","author":"R Collobert","year":"2011","unstructured":"Collobert R, Weston J, Bottou L, Karlen M, Kavukcuoglu K, Kuksa P (2011) Natural language processing (almost) from scratch. J Mach Learn Res 12:2493\u20132537","journal-title":"J Mach Learn Res"},{"issue":"6","key":"2794_CR3","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton G, Deng l, Yu D, Dahl G, Mohamed A-R, Jaitly N, Senior A, Vanhoucke V, Nguyen P, Sainath T, Kingsbury B (2012) Deep neural networks for acoustic modeling in speech recognition. IEEE Sig Process Mag 29(6):11","journal-title":"IEEE Sig Process Mag"},{"key":"2794_CR4","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y Lecun","year":"1998","unstructured":"Lecun Y, Bottou L, Bengio Y, Haffner P (1998) Gradient-based learning applied to document recognition. Proc IEEE 86:2278\u20132324","journal-title":"Proc IEEE"},{"key":"2794_CR5","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"2794_CR6","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"2794_CR7","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"2794_CR8","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"2794_CR9","unstructured":"Goodfellow I (2016) Nips 2016 tutorial: Generative adversarial networks. arXiv:1701.00160"},{"issue":"3","key":"2794_CR10","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1023\/A:1018628609742","volume":"9","author":"JAK Suykens","year":"1999","unstructured":"Suykens JAK, Vandewalle J (1999) Least squares support vector machine classifiers. Neural Process Lett 9(3):293\u2013300","journal-title":"Neural Process Lett"},{"key":"2794_CR11","doi-asserted-by":"crossref","unstructured":"Cheng J, Greiner R (2001) Learning bayesian belief network classifiers: Algorithms and system. In: Conference of the Canadian society for computational studies of intelligence. Springer, pp 141\u2013151","DOI":"10.1007\/3-540-45153-6_14"},{"key":"2794_CR12","doi-asserted-by":"crossref","unstructured":"Keller JM, Gray MR, Givens JA (1985) A fuzzy k-nearest neighbor algorithm. IEEE Trans Syst Man Cybern (4):580\u2013585","DOI":"10.1109\/TSMC.1985.6313426"},{"issue":"2","key":"2794_CR13","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1109\/MM.2019.2899330","volume":"39","author":"M Arafa","year":"2019","unstructured":"Arafa M, Fahim B, Kottapalli S, Kumar A, Looi LP, Mandava S, Rudoff A, Steiner IM, Valentine B, Vedaraman G et al (2019) Cascade lake: Next generation intel xeon scalable processor. IEEE Micro 39(2):29\u201336","journal-title":"IEEE Micro"},{"key":"2794_CR14","unstructured":"Nvidia (2017) Nvidia tesla v100 gpu architecture. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf\/https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf\/https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf\/"},{"key":"2794_CR15","unstructured":"Nvidia (2020) Nvidia a100 tensor core gpu architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf\/"},{"issue":"1","key":"2794_CR16","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1109\/TCAD.2017.2705069","volume":"37","author":"K Guo","year":"2017","unstructured":"Guo K, Sui L, Qiu J, Yu J, Wang J, Yao S, Han S, Wang Y, Yang H (2017) Angel-eye: A complete design flow for mapping cnn onto embedded fpga. IEEE Trans Comput-Aided Des Integr Circ Syst 37(1):35\u201347","journal-title":"IEEE Trans Comput-Aided Des Integr Circ Syst"},{"key":"2794_CR17","doi-asserted-by":"crossref","unstructured":"Venieris SI, Bouganis C-S (2017) fpgaconvnet: Automated mapping of convolutional neural networks on fpgas. In: Proceedings of the 2017 ACM\/SIGDA international symposium on field-programmable gate arrays, pp 291\u2013292","DOI":"10.1145\/3020078.3021791"},{"key":"2794_CR18","doi-asserted-by":"crossref","unstructured":"Nakahara H, Fujii T, Sato S (2017) A fully connected layer elimination for a binarizec convolutional neural network on an fpga. In: 2017 27th international conference on field programmable logic and applications (FPL). IEEE, pp 1\u20134","DOI":"10.23919\/FPL.2017.8056771"},{"issue":"1","key":"2794_CR19","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1145\/2654822.2541967","volume":"42","author":"T Chen","year":"2014","unstructured":"Chen T, Du Z, Sun N, Wang J, Wu C, Chen Y, Temam O (2014) Diannao: A small-footprint high-throughput accelerator for ubiquitous machine-learning. ACM SIGARCH Comput Architect News 42(1):269\u2013284","journal-title":"ACM SIGARCH Comput Architect News"},{"issue":"1","key":"2794_CR20","doi-asserted-by":"publisher","first-page":"127","DOI":"10.1109\/JSSC.2016.2616357","volume":"52","author":"Y-H Chen","year":"2016","unstructured":"Chen Y-H, Krishna T, Emer JS, Sze V (2016) Eyeriss: An energy-efficient reconfigurable accelerator for deep convolutional neural networks. IEEE J Solid-State Circ 52(1):127\u2013138","journal-title":"IEEE J Solid-State Circ"},{"issue":"2","key":"2794_CR21","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1145\/3140659.3080254","volume":"45","author":"A Parashar","year":"2017","unstructured":"Parashar A, Rhu M, Mukkara A, Puglielli A, Venkatesan R, Khailany B, Emer J, Keckler SW, Dally WJ (2017) Scnn: An accelerator for compressed-sparse convolutional neural networks. ACM SIGARCH Comput Archit News 45(2):27\u201340","journal-title":"ACM SIGARCH Comput Archit News"},{"issue":"3","key":"2794_CR22","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1145\/3007787.3001163","volume":"44","author":"S Han","year":"2016","unstructured":"Han S, Liu X, Mao H, Jing P u, Pedram A, Horowitz MA, Dally WJ (2016) Eie: efficient inference engine on compressed deep neural network. ACM SIGARCH Comput Archit News 44(3):243\u2013254","journal-title":"ACM SIGARCH Comput Archit News"},{"key":"2794_CR23","doi-asserted-by":"crossref","unstructured":"Jouppi NP, Young C, Patil N, Patterson D, Agrawal G, Bajwa R, Bates S, Bhatia S, Boden N, Borchers A et al (2017) In-datacenter performance analysis of a tensor processing unit. In: Proceedings of the 44th annual international symposium on computer architecture, pp 1\u201312","DOI":"10.1145\/3079856.3080246"},{"key":"2794_CR24","unstructured":"Lipton RJ, Lopresti D (1985) A systolic array for rapid string comparison. In: Proceedings of the chapel hill conference on VLSI. Chapel Hill NC, pp 363\u2013376"},{"key":"2794_CR25","doi-asserted-by":"crossref","unstructured":"Zhang S, Du Z, Zhang L, Lan H, Liu S, Li L, Qi G, Chen T, Chen Y (2016) Cambricon-x: An accelerator for sparse neural networks. In: 2016 49th Annual IEEE\/ACM international symposium on microarchitecture (MICRO). IEEE, pp 1\u201312","DOI":"10.1109\/MICRO.2016.7783723"},{"key":"2794_CR26","doi-asserted-by":"crossref","unstructured":"He P, Chen G, Deng K, Yao P, Fu L (2020) Improve image classification by convolutional network on Cambricon. pp 75\u201382","DOI":"10.1007\/978-3-030-49556-5_7"},{"issue":"4","key":"2794_CR27","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1145\/1498765.1498785","volume":"52","author":"S Williams","year":"2009","unstructured":"Williams S, Waterman A, Patterson D (2009) Roofline: an insightful visual performance model for multicore architectures. Commun ACM 52(4):65\u201376","journal-title":"Commun ACM"},{"key":"2794_CR28","unstructured":"Md AR, Goli N, Aamodt TM (2019) Modeling deep learning accelerator enabled gpus. In: 2019 IEEE international symposium on performance analysis of systems and software (ISPASS). IEEE, pp 79\u201392"},{"key":"2794_CR29","unstructured":"Kalamkar D, Mudigere D, Mellempudi N, Das D, Banerjee K, Avancha S, Vooturi DT, Jammalamadaka N, Huang J, Yuen H et al (2019) A study of bfloat16 for deep learning training. arXiv:1905.12322"},{"key":"2794_CR30","doi-asserted-by":"crossref","unstructured":"Yao Z, Cao S, Xiao W, Zhang C, Nie L (2019) Balanced sparsity for efficient dnn inference on gpu. In: Proceedings of the AAAI conference on artificial intelligence, vol 33, pp 5676\u2013 5683","DOI":"10.1609\/aaai.v33i01.33015676"},{"key":"2794_CR31","doi-asserted-by":"crossref","unstructured":"Cao S, Zhang C, Yao Z, Xiao W, Nie L, Zhan D, Liu Y, Wu M, Zhang L (2019) Efficient and effective sparse lstm on fpga with bank-balanced sparsity. In: Proceedings of the 2019 ACM\/SIGDA international symposium on field-programmable gate arrays, pp 63\u201372","DOI":"10.1145\/3289602.3293898"},{"key":"2794_CR32","doi-asserted-by":"crossref","unstructured":"Norrie T, Patil N, Yoon DH, Kurian G, Li S, Laudon J, Young C, Jouppi NP, Patterson D (2020) Google\u2019s training chips revealed: Tpuv2 and tpuv3. Hotchips","DOI":"10.1109\/HCS49909.2020.9220735"},{"key":"2794_CR33","doi-asserted-by":"crossref","unstructured":"Hecht-Nielsen R (1992) Theory of the backpropagation neural network. In: Neural networks for perception. Elsevier, pp 65\u201393","DOI":"10.1016\/B978-0-12-741252-8.50010-8"},{"key":"2794_CR34","doi-asserted-by":"crossref","unstructured":"Hara K, Saito D, Shouno H (2015) Analysis of function of rectified linear unit used in deep learning. In: 2015 international joint conference on neural networks (IJCNN). IEEE, pp 1\u20138","DOI":"10.1109\/IJCNN.2015.7280578"},{"key":"2794_CR35","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv:1502.03167"},{"key":"2794_CR36","unstructured":"Samuel W, Yang C, Wang Y (2020) Roofline performance modeling for hpc and deep learning applications"},{"key":"2794_CR37","volume-title":"The traveling salesman problem","author":"DL Applegate","year":"2011","unstructured":"Applegate DL, Bixby RE, Chv\u00e1tal V, Cook WJ (2011) The traveling salesman problem. Princeton University Press, Princeton"},{"key":"2794_CR38","unstructured":"NVIDIA (2021) Cuda toolkit documentation"},{"key":"2794_CR39","unstructured":"Mishra A, Latorre JA, Pool J, Stosic D, Stosic D, Venkatesh G, Yu C, Micikevicius P (2021) Accelerating sparse deep neural networks. arXiv:2104.08378"},{"key":"2794_CR40","doi-asserted-by":"crossref","unstructured":"Jacob B, Kligys S, Bo C, Zhu M, Tang M, Howard A, Adam H, Kalenichenko D (2018) Quantization and training of neural networks for efficient integer-arithmetic-only inference. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2704\u20132713","DOI":"10.1109\/CVPR.2018.00286"},{"key":"2794_CR41","unstructured":"Lin D, Talathi S, Annapureddy S (2016) Fixed point quantization of deep convolutional networks. In: International conference on machine learning, pp 2849\u20132858"},{"key":"2794_CR42","unstructured":"Gupta S, Agrawal A, Gopalakrishnan K, Narayanan P (2015) Deep learning with limited numerical precision. In: International conference on machine learning, pp 1737\u20131746"},{"key":"2794_CR43","unstructured":"Mark H (2014) Energy table for 45nm process. In: Stanford VLSI wiki"},{"key":"2794_CR44","unstructured":"Courbariaux M, Bengio Y, David J-P (2014) Training deep neural networks with low precision multiplications. arXiv:1412.7024"},{"key":"2794_CR45","doi-asserted-by":"crossref","unstructured":"Liu H, Ferdman M, Huh J, Burger D (2008) Cache bursts: A new approach for eliminating dead blocks and increasing cache efficiency. In: 2008 41st IEEE\/ACM international symposium on microarchitecture. IEEE, pp 222\u2013233","DOI":"10.1109\/MICRO.2008.4771793"},{"key":"2794_CR46","doi-asserted-by":"crossref","unstructured":"Ma S, Guo Y, Chen S, Huang L, Wang Z (2019) Improving the dram access efficiency for matrix multiplication on multicore accelerators. In: 2019 design, automation & test in europe conference & exhibition (DATE). IEEE, pp 1058\u20131063","DOI":"10.23919\/DATE.2019.8714915"},{"key":"2794_CR47","doi-asserted-by":"crossref","unstructured":"Ham TJ, Arag\u00f3n JL, Martonosi M (2015) Desc: Decoupled supply-compute communication management for heterogeneous architectures. In: 2015 48th annual IEEE\/ACM international symposium on microarchitecture (MICRO). IEEE, pp 191\u2013 203","DOI":"10.1145\/2830772.2830800"},{"key":"2794_CR48","doi-asserted-by":"crossref","unstructured":"Wang Z, Nowatzki T (2019) Stream-based memory access specialization for general purpose processors. In: 2019 ACM\/IEEE 46th annual international symposium on computer architecture (ISCA). IEEE, pp 736\u2013749","DOI":"10.1145\/3307650.3322229"},{"issue":"3","key":"2794_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3007787.3001138","volume":"44","author":"J Albericio","year":"2016","unstructured":"Albericio J, Judd P, Hetherington T, Aamodt T, Jerger NE, Moshovos A (2016) Cnvlutin: Ineffectual-neuron-free deep neural network computing. ACM SIGARCH Comput Archit News 44 (3):1\u201313","journal-title":"ACM SIGARCH Comput Archit News"},{"key":"2794_CR50","unstructured":"Han S, Kang J, Mao H, Hu Y, Li X, Li Y, Xie D, Luo H, Yao S, Wang Y et al (2016) Ese: Efficient speech recognition engine with compressed lstm on fpga. arXiv:1612.00694"},{"key":"2794_CR51","first-page":"6","volume":"27","author":"K Kiningham","year":"2016","unstructured":"Kiningham K, Graczyk M, Ramkumar A, Stanford SCPD (2016) Design and analysis of a hardware cnn accelerator. Small 27:6","journal-title":"Small"},{"key":"2794_CR52","doi-asserted-by":"crossref","unstructured":"Xu R, Han F, Ta Q (2018) Deep learning at scale on nvidia v100 accelerators. pp 23\u201332","DOI":"10.1109\/PMBS.2018.8641600"},{"issue":"7","key":"2794_CR53","doi-asserted-by":"publisher","first-page":"1035","DOI":"10.1109\/TC.2019.2895031","volume":"68","author":"H Zhang","year":"2019","unstructured":"Zhang H, Chen D, Ko SB (2019) Efficient multiple-precision floating-point fused multiply-add with mixed-precision support. IEEE Trans Comput 68(7):1035\u20131048","journal-title":"IEEE Trans Comput"},{"key":"2794_CR54","unstructured":"Han S, Mao H, William JD (2015) Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv:1510.00149"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02794-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-021-02794-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02794-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,5]],"date-time":"2022-05-05T09:15:35Z","timestamp":1651742135000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-021-02794-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,20]]},"references-count":54,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2022,5]]}},"alternative-id":["2794"],"URL":"https:\/\/doi.org\/10.1007\/s10489-021-02794-5","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2021,9,20]]},"assertion":[{"value":"25 August 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}