{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T13:54:32Z","timestamp":1774965272375,"version":"3.50.1"},"reference-count":174,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2020,5,28]],"date-time":"2020-05-28T00:00:00Z","timestamp":1590624000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,5,28]],"date-time":"2020-05-28T00:00:00Z","timestamp":1590624000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/100016714","name":"University of Sharjah","doi-asserted-by":"crossref","award":["CPS"],"award-info":[{"award-number":["CPS"]}],"id":[{"id":"10.13039\/100016714","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2021,2]]},"DOI":"10.1007\/s11227-020-03325-8","type":"journal-article","created":{"date-parts":[[2020,5,28]],"date-time":"2020-05-28T18:03:32Z","timestamp":1590689012000},"page":"1897-1938","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":156,"title":["A systematic literature review on hardware implementation of artificial intelligence algorithms"],"prefix":"10.1007","volume":"77","author":[{"given":"Manar Abu","family":"Talib","sequence":"first","affiliation":[]},{"given":"Sohaib","family":"Majzoub","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2837-3402","authenticated-orcid":false,"given":"Qassim","family":"Nasir","sequence":"additional","affiliation":[]},{"given":"Dina","family":"Jamal","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,5,28]]},"reference":[{"issue":"12","key":"3325_CR1","doi-asserted-by":"publisher","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","volume":"105","author":"V Sze","year":"2017","unstructured":"Sze V, Chen Y-H, Yang T-J, Emer JS (2017) Efficient processing of deep neural networks: a tutorial and survey. Proc IEEE 105(12):2295\u20132329","journal-title":"Proc IEEE"},{"issue":"2","key":"3325_CR2","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1109\/69.87994","volume":"3","author":"LF Pau","year":"1991","unstructured":"Pau LF (1991) Artificial intelligence and financial services. IEEE Trans Knowl Data Eng 3(2):137\u2013148","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"3325_CR3","doi-asserted-by":"crossref","unstructured":"Yao X, Zhou J, Zhang J, Boer CR (2017) From intelligent manufacturing to smart manufacturing for industry 4.0 driven by next generation artificial intelligence and further on. In: 5th International Conference on Enterprise Systems (ES)","DOI":"10.1109\/ES.2017.58"},{"key":"3325_CR4","doi-asserted-by":"crossref","unstructured":"Bishnoi L, Narayan Singh S (2018) Artificial intelligence techniques used in medical sciences: a review. In: 8th International Conference on Cloud Computing, Data Science and Engineering (Confluence), pp 106\u2013113","DOI":"10.1109\/CONFLUENCE.2018.8442729"},{"key":"3325_CR5","unstructured":"Parker DS (1989) Integrating AI and DBMS through stream processing. In: Proceedings of Fifth International Conference on Data Engineering"},{"key":"3325_CR6","doi-asserted-by":"crossref","unstructured":"Fraley JB, Cannady J (2017) The promise of machine learning in cybersecurity. SoutheastCon","DOI":"10.1109\/SECON.2017.7925283"},{"key":"3325_CR7","doi-asserted-by":"crossref","unstructured":"Farabet C, Poulet C, Han JY, LeCun Y (2009). CNP: an FPGA-based processor for convolutional networks. Presented at the 2009 International Conference on Field Programmable Logic and Applications (FPL)","DOI":"10.1109\/FPL.2009.5272559"},{"key":"3325_CR8","doi-asserted-by":"crossref","unstructured":"Rao Q, Frtunikj J (2018) Deep learning for self-driving cars. In: Proceedings of the 1st International Workshop on Software Engineering for AI in Autonomous Systems\u2014SEFAIS \u201918","DOI":"10.1145\/3194085.3194087"},{"key":"3325_CR9","doi-asserted-by":"crossref","unstructured":"Duffany JL (2010) Artificial intelligence in GPS navigation systems. Presented at the 2010 2nd International Conference on Software Technology and Engineering (ICSTE 2010)","DOI":"10.1109\/ICSTE.2010.5608862"},{"key":"3325_CR10","doi-asserted-by":"crossref","unstructured":"Schutzer D (1983) Applications of artificial intelligence to military communications. In: IEEE Military Communications Conference, pp 786\u2013790","DOI":"10.1109\/MILCOM.1983.4794808"},{"issue":"1\u20133","key":"3325_CR11","doi-asserted-by":"publisher","first-page":"239","DOI":"10.1016\/j.neucom.2010.03.021","volume":"74","author":"J Misra","year":"2010","unstructured":"Misra J, Saha I (2010) Artificial neural networks in hardware: a survey of two decades of progress. Neurocomputing 74(1\u20133):239\u2013255","journal-title":"Neurocomputing"},{"key":"3325_CR12","doi-asserted-by":"crossref","unstructured":"Baji T (2018) Evolution of the GPU device widely used in AI and massive parallel processing. In: IEEE 2nd Electron Devices Technology and Manufacturing Conference (EDTM)","DOI":"10.1109\/EDTM.2018.8421507"},{"issue":"1","key":"3325_CR13","first-page":"63","volume":"9","author":"P Jawandhiya","year":"2018","unstructured":"Jawandhiya P (2018) Hardware design for machine learning. Int J Artif Intell Appl (IJAIA) 9(1):63\u201384","journal-title":"Int J Artif Intell Appl (IJAIA)"},{"key":"3325_CR14","doi-asserted-by":"publisher","first-page":"7823","DOI":"10.1109\/ACCESS.2018.2890150","volume":"7","author":"A Shawahna","year":"2019","unstructured":"Shawahna A, Sait SM, El-Maleh A (2019) FPGA-based accelerators of deep learning networks for learning and classification: a review. IEEE Access 7:7823\u20137859","journal-title":"IEEE Access"},{"issue":"1","key":"3325_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TCIAIG.2009.2021433","volume":"1","author":"SM Lucas","year":"2009","unstructured":"Lucas SM (2009) Computational intelligence and AI in games: a new IEEE transaction. IEEE Trans Comput Intell AI Games 1(1):1\u20133","journal-title":"IEEE Trans Comput Intell AI Games"},{"key":"3325_CR16","unstructured":"Rigos S (2012) A hardware acceleration unit for face detection. In: Mediterranean Conference on Embedded Computing (MECO), Bar, pp 17\u201321"},{"issue":"4","key":"3325_CR17","doi-asserted-by":"publisher","first-page":"1109","DOI":"10.1007\/s00521-018-3761-1","volume":"32","author":"S Mittal","year":"2018","unstructured":"Mittal S (2018) A survey of FPGA-based accelerators for convolutional neural networks. Neural Comput Appl 32(4):1109\u20131139","journal-title":"Neural Comput Appl"},{"issue":"1","key":"3325_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3289185","volume":"12","author":"K Guo","year":"2019","unstructured":"Guo K, Zeng S, Yu J, Wang Y, Yang H (2019) [DL] A survey of FPGA-based neural network inference accelerators. ACM Trans Reconfig Technol Syst 12(1):1\u201326","journal-title":"ACM Trans Reconfig Technol Syst"},{"key":"3325_CR19","unstructured":"Wang T, Wang C, Zhou X, Chen H (2018) A survey of FPGA based deep learning accelerators: challenges and opportunities. arXiv preprint arXiv:1901.04988"},{"key":"3325_CR20","doi-asserted-by":"crossref","unstructured":"Budgen D, Brereton P (2006) Performing systematic literature reviews in software engineering. In: Proceeding of the 28th International Conference on Software Engineering\u2014ICSE \u201906","DOI":"10.1145\/1134285.1134500"},{"key":"3325_CR21","doi-asserted-by":"crossref","unstructured":"Ma Y, Cao Y, Vrudhula S, Seo J (2017) An automatic RTL compiler for high-throughput FPGA implementation of diverse deep convolutional neural networks. In: 27th International Conference on Field Programmable Logic and Applications (FPL)","DOI":"10.23919\/FPL.2017.8056824"},{"key":"3325_CR22","doi-asserted-by":"crossref","unstructured":"Nurvitadhi E, Venkatesh G, Sim J, Marr D, Huang R, Ong Gee Hock J, Liew YT, Srivatsan K, Moss D, Subhaschandra S, Boudoukh G (2017) Can FPGAs beat GPUs in accelerating next-generation deep neural networks? In: Proceedings of the 2017 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays\u2014FPGA \u201917","DOI":"10.1145\/3020078.3021740"},{"key":"3325_CR23","unstructured":"Lacey G, Taylor G, Areibi S (2016) Deep learning on FPGAs: past, present, and future, pp 1\u20138. arXiv: 1602.04283"},{"key":"3325_CR24","doi-asserted-by":"crossref","unstructured":"Faraone J, Gambardella G, Boland D, Fraser N, Blott M, Leong PHW (2018) Customizing low-precision deep neural networks for FPGAs. In: 28th International Conference on Field Programmable Logic and Applications (FPL)","DOI":"10.1109\/FPL.2018.00025"},{"key":"3325_CR25","doi-asserted-by":"crossref","unstructured":"Cheng Kwang-Ting, Wang Yi-Chu (2011) Using mobile GPU for general-purpose computing; a case study of face recognition on smartphones. In: Proceedings of 2011 International Symposium on VLSI Design, Automation and Test","DOI":"10.1109\/VDAT.2011.5783575"},{"key":"3325_CR26","doi-asserted-by":"crossref","unstructured":"Ouerhani Y, Jridi M, AlFalou A (2010) Fast face recognition approach using a graphical processing unit \u201cGPU\u201d. In: IEEE International Conference on Imaging Systems and Techniques","DOI":"10.1109\/IST.2010.5548545"},{"key":"3325_CR27","doi-asserted-by":"crossref","unstructured":"Li E, Wang B, Yang L, Peng Y, Du Y, Zhang Y, Chiu Y-J (2012) GPU and CPU cooperative acceleration for face detection on modern processors. Presented at the 2012 IEEE International Conference on Multimedia and Expo (ICME)","DOI":"10.1109\/ICME.2012.121"},{"key":"3325_CR28","doi-asserted-by":"crossref","unstructured":"Shah AA, Zaidi ZA, Chowdhry BS, Daudpoto J (2016) Real time face detection\/monitor using raspberry pi and MATLAB. In: IEEE 10th International Conference on Application of Information and Communication Technologies (AICT)","DOI":"10.1109\/ICAICT.2016.7991743"},{"key":"3325_CR29","doi-asserted-by":"crossref","unstructured":"Oro D, Fernandez C, Saeta JR, Martorell X, Hernando J (2011) Real-time GPU-based face detection in HD video sequences. In: IEEE International Conference on Computer Vision Workshops (ICCV Workshops)","DOI":"10.1109\/ICCVW.2011.6130288"},{"key":"3325_CR30","unstructured":"Gao C, Lu SL (2008) Novel FPGA based Haar classifier face detection algorithm acceleration. Presented at the 2008 International Conference on Field Programmable Logic and Applications (FPL)"},{"key":"3325_CR31","doi-asserted-by":"crossref","unstructured":"Cho J, Mirzaei S, Oberg J, Kastner R (2009) FPGA-based face detection system using Haar classifiers. In: Proceeding of the ACM\/SIGDA International Symposium on Field Programmable Gate Arrays\u2014FPGA \u201909","DOI":"10.1145\/1508128.1508144"},{"key":"3325_CR32","doi-asserted-by":"crossref","unstructured":"He C, Papakonstantinou A, Chen D (2009) A novel SoC architecture on FPGA for ultra fast face detection. Presented at the 2009 IEEE International Conference on Computer Design (ICCD 2009)","DOI":"10.1109\/ICCD.2009.5413122"},{"issue":"4","key":"3325_CR33","doi-asserted-by":"publisher","first-page":"597","DOI":"10.1109\/TCSVT.2009.2014013","volume":"19","author":"N Farrugia","year":"2009","unstructured":"Farrugia N, Mamalet F, Roux S, Fan Yang, Paindavoine M (2009) Fast and robust face detection on a parallel optimized architecture implemented on FPGA. IEEE Trans Circuits Syst Video Technol 19(4):597\u2013602","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"3325_CR34","doi-asserted-by":"crossref","unstructured":"Farabet C, Poulet C, LeCun Y (2009) An FPGA-based stream processor for embedded real-time vision with convolutional networks. In: IEEE 12th International Conference on Computer Vision Workshops, ICCV Workshops","DOI":"10.1109\/ICCVW.2009.5457611"},{"key":"3325_CR35","doi-asserted-by":"crossref","unstructured":"Kyrkou C, Theocharides T (2011) A flexible parallel hardware architecture for AdaBoost-based real-time object detection. IEEE Trans Very Large Scale Integr (VLSI) Syst 19(6):1034\u20131047","DOI":"10.1109\/TVLSI.2010.2048224"},{"key":"3325_CR36","unstructured":"Zhou W, Zou Y, Dai L, Zeng X (2011) A high speed reconfigurable face detection architecture. Presented at the 2011 IEEE 9th International Conference on ASIC (ASICON 2011)"},{"key":"3325_CR37","doi-asserted-by":"crossref","unstructured":"Wang N-J, Chang S-C, Chou P-J (2012) A real-time multi-face detection system implemented on FPGA. Presented at the 2012 International Symposium on Intelligent Signal Processing and Communications Systems (ISPACS 2012)","DOI":"10.1109\/ISPACS.2012.6473506"},{"key":"3325_CR38","unstructured":"Bauer S, Brunsmann U, Schlotterbeck-Macht S (2009) FPGA implementation of a HOG-based pedestrian recognition system. In: MPC Workshop, pp 49\u201358"},{"key":"3325_CR39","doi-asserted-by":"crossref","unstructured":"Hiromoto M, Miyamoto R (2009) Hardware architecture for high-accuracy real-time pedestrian detection with CoHOG features. In: IEEE 12th International Conference on Computer Vision Workshops, ICCV Workshops","DOI":"10.1109\/ICCVW.2009.5457609"},{"key":"3325_CR40","doi-asserted-by":"crossref","unstructured":"Bauer S, Kohler S, Doll K, Brunsmann U (2010) FPGA-GPU architecture for kernel SVM pedestrian detection. In: IEEE Computer Society Conference on Computer Vision and Pattern Recognition\u2014Workshops","DOI":"10.1109\/CVPRW.2010.5543772"},{"key":"3325_CR41","unstructured":"Kryjak T, Komorkiewicz M, Gorgon M (2012) FPGA implementation of real-time headshoulder detection using local binary patterns, SVM and foreground object detection. In: Conference on Design and Architectures for Signal and Image Processing (DASIP), pp 1\u20138"},{"key":"3325_CR42","doi-asserted-by":"crossref","unstructured":"Sharma B, Thota R, Vydyanathan N, Kale A (2009) Towards a robust, real-time face processing system using CUDA-enabled GPUs. In: International Conference on High Performance Computing (HiPC)","DOI":"10.1109\/HIPC.2009.5433189"},{"key":"3325_CR43","doi-asserted-by":"crossref","unstructured":"Kong J, Deng Y (2010) GPU accelerated face detection. In: International Conference on Intelligent Control and Information Processing","DOI":"10.1109\/ICICIP.2010.5564978"},{"key":"3325_CR44","doi-asserted-by":"crossref","unstructured":"Hefenbrock D, Oberg J, Thanh NTN, Kastner R, Baden SB (2010) Accelerating Viola-Jones face detection to FPGA-level using GPUs. In: 18th IEEE Annual International Symposium on Field-Programmable Custom Computing Machines","DOI":"10.1109\/FCCM.2010.12"},{"key":"3325_CR45","doi-asserted-by":"crossref","unstructured":"Masek J, Burget R, Uher V, Guney S (2013) Speeding up Viola-Jones algorithm using multi-Core GPU implementation. Presented at the 2013 36th International Conference on Telecommunications and Signal Processing (TSP)","DOI":"10.1109\/TSP.2013.6614050"},{"key":"3325_CR46","doi-asserted-by":"publisher","first-page":"156","DOI":"10.1016\/j.procs.2016.05.142","volume":"87","author":"V Jain","year":"2016","unstructured":"Jain V, Patel D (2016) A GPU based implementation of robust face detection system. Procedia Comput Sci 87:156\u2013163","journal-title":"Procedia Comput Sci"},{"issue":"1","key":"3325_CR47","first-page":"68","volume":"17","author":"G Lescano","year":"2017","unstructured":"Lescano G, Santana P, Costaguta R (2017) Analysis of a GPU implementation of Viola-Jones\u2019 algorithm for features selection. J Comput Sci Technol 17(1):68\u201373","journal-title":"J Comput Sci Technol"},{"key":"3325_CR48","doi-asserted-by":"crossref","unstructured":"Hahnle M, Saxen F, Hisung M, Brunsmann U, Doll K (2013) FPGA-based real-time pedestrian detection on high-resolution images. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp 629\u2013635","DOI":"10.1109\/CVPRW.2013.95"},{"key":"3325_CR49","doi-asserted-by":"crossref","unstructured":"Komorkiewicz M, Kluczewski M, Gorgon M (2012) Floating point HOG implementation for real-time multiple object detection. Presented at the 2012 22nd International Conference on Field Programmable Logic and Applications (FPL)","DOI":"10.1109\/FPL.2012.6339159"},{"issue":"6","key":"3325_CR50","doi-asserted-by":"publisher","first-page":"1051","DOI":"10.1109\/TCSVT.2014.2360030","volume":"25","author":"X Ma","year":"2015","unstructured":"Ma X, Najjar WA, Roy-Chowdhury AK (2015) Evaluation and acceleration of high-throughput fixed-point object detection on FPGAs. IEEE Trans Circuits Syst Video Technol 25(6):1051\u20131062","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"3325_CR51","doi-asserted-by":"crossref","unstructured":"Dwith CYN, Rathna GN (2012) Parallel implementation of LBP based face recognition on GPU using OpenCL. In: The International Conference on Parallel and Distributed Computing, Applications and Technologies (PDCAT), pp 755\u2013760","DOI":"10.1109\/PDCAT.2012.107"},{"key":"3325_CR52","doi-asserted-by":"crossref","unstructured":"Oh C, Yi S, Yi Y (2015) Real-time face detection in full HD images exploiting both embedded CPU and GPU. Presented at the 2015 IEEE International Conference on Multimedia and Expo (ICME)","DOI":"10.1109\/ICME.2015.7177522"},{"issue":"12","key":"3325_CR53","doi-asserted-by":"publisher","first-page":"2878","DOI":"10.1587\/transinf.2018PAP0004","volume":"101","author":"C Oh","year":"2018","unstructured":"Oh C, Yi S, Yi Y (2018) Real-time and energy-efficient face detection on CPU-GPU heterogeneous embedded platforms. IEICE Trans Inf Syst E 101(12):2878\u20132888","journal-title":"IEICE Trans Inf Syst E"},{"key":"3325_CR54","doi-asserted-by":"crossref","unstructured":"Negi K, Dohi K, Shibata Y, Oguri K (2011) Deep pipelined one-chip FPGA implementation of a real-time image-based human detection algorithm. In: International Conference on Field-Programmable Technology","DOI":"10.1109\/FPT.2011.6132679"},{"key":"3325_CR55","doi-asserted-by":"crossref","unstructured":"Zhao J, Zhu S, Huang X (2013) Real-time traffic sign detection using SURF features on FPGA. In: IEEE High Performance Extreme Computing Conference (HPEC)","DOI":"10.1109\/HPEC.2013.6670350"},{"key":"3325_CR56","doi-asserted-by":"crossref","unstructured":"Nasse F, Thurau C, Fink GA (2009) Face detection using GPU-based convolutional neural networks. In Proceedings of the 13th international conference on computer analysis of images and patterns. Springer, Berlin, pp 83\u201390","DOI":"10.1007\/978-3-642-03767-2_10"},{"key":"3325_CR57","doi-asserted-by":"crossref","unstructured":"Li H, Lin Z, Shen X, Brandt J, Hua G (2015) A convolutional neural network cascade for face detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp 5325\u20135334","DOI":"10.1109\/CVPR.2015.7299170"},{"key":"3325_CR58","doi-asserted-by":"crossref","unstructured":"Cengil E, Cinar A, Guler Z (2017) A GPU-based convolutional neural network approach for image classification. Presented at the 2017 International Artificial Intelligence and Data Processing Symposium (IDAP)","DOI":"10.1109\/IDAP.2017.8090194"},{"key":"3325_CR59","first-page":"2110","volume-title":"ICCVW","author":"N Tijtgat","year":"2017","unstructured":"Tijtgat N, Ranst WV, Volckaert B, Goedeme T, Turck FD (2017) Embedded real-time object detection for a UAV warning system. ICCVW. Venice, Italy, pp 2110\u20132118"},{"issue":"2","key":"3325_CR60","doi-asserted-by":"publisher","first-page":"361","DOI":"10.1109\/TCE.2013.6531118","volume":"59","author":"D Berjon","year":"2013","unstructured":"Berjon D, Cuevas C, Moran F, Garcia N (2013) GPU-based implementation of an optimized nonparametric background modeling for real-time moving object detection. IEEE Trans Consum Electron 59(2):361\u2013369","journal-title":"IEEE Trans Consum Electron"},{"key":"3325_CR61","doi-asserted-by":"crossref","unstructured":"Obukhov A (2011) Haar classifiers for object detection with CUDA. In: GPU computing gems, Emerald Edition. Elsevier, pp 517\u2013544","DOI":"10.1016\/B978-0-12-384988-5.00033-4"},{"key":"3325_CR62","doi-asserted-by":"crossref","unstructured":"Pertsau D, Uvarov A (2013) Face detection algorithm using Haar-like feature for GPU architecture. In: IEEE 7th International Conference on Intelligent Data Acquisition and Advanced Computing Systems (IDAACS)","DOI":"10.1109\/IDAACS.2013.6663020"},{"key":"3325_CR63","doi-asserted-by":"crossref","unstructured":"Coates A, Baumstarck P, Le Q, Ng AY (2009) Scalable learning for object detection with GPU hardware. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems","DOI":"10.1109\/IROS.2009.5354084"},{"key":"3325_CR64","doi-asserted-by":"crossref","unstructured":"Oro D, Fern\u2019ndez C, Segura C, Martorell X, Hernando J (2012) Accelerating boosting-based face detection on GPUs. In: 41st International Conference on Parallel Processing","DOI":"10.1109\/ICPP.2012.12"},{"issue":"3","key":"3325_CR65","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1007\/s11554-010-0179-0","volume":"6","author":"A Herout","year":"2010","unstructured":"Herout A, Jo\u0161th R, Jur\u00e1nek R, Havel J, Hradi\u0161 M, Zem\u010d\u00edk P (2010) Real-time object detection on CUDA. J Real-Time Image Proc 6(3):159\u2013170","journal-title":"J Real-Time Image Proc"},{"issue":"8","key":"3325_CR66","doi-asserted-by":"publisher","first-page":"3299","DOI":"10.1109\/TIE.2011.2165451","volume":"59","author":"H Zhuang","year":"2012","unstructured":"Zhuang H, Low K-S, Yau W-Y (2012) Multichannel pulse-coupled-neural-network-based color image segmentation for object detection. IEEE Trans Ind Electron 59(8):3299\u20133308","journal-title":"IEEE Trans Ind Electron"},{"key":"3325_CR67","doi-asserted-by":"crossref","unstructured":"Lozano OM, Otsuka K (2008) Simultaneous and fast 3D tracking of multiple faces in video by GPU-based stream processing. In: IEEE International Conference on Acoustics. Speech and Signal Processing, ICASSP, p 2008","DOI":"10.1109\/ICASSP.2008.4517709"},{"issue":"10","key":"3325_CR68","doi-asserted-by":"publisher","first-page":"2376","DOI":"10.1109\/TC.2013.130","volume":"63","author":"PR Possa","year":"2014","unstructured":"Possa PR, Mahmoudi SA, Harb N, Valderrama C, Manneback P (2014) A multi-resolution FPGA-based architecture for real-time edge and corner detection. IEEE Trans Comput 63(10):2376\u20132388","journal-title":"IEEE Trans Comput"},{"issue":"10","key":"3325_CR69","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1016\/j.sysarc.2015.09.005","volume":"61","author":"JPF Barbosa","year":"2015","unstructured":"Barbosa JPF, Ferreira APA, Rocha RCF, Albuquerque ES, Reis JR, Albuquerque DS, Barros ENS (2015) A high performance hardware accelerator for dynamic texture segmentation. J Syst Archit 61(10):639\u2013645","journal-title":"J Syst Archit"},{"issue":"1","key":"3325_CR70","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1007\/s11554-012-0290-5","volume":"9","author":"T Kryjak","year":"2012","unstructured":"Kryjak T, Komorkiewicz M, Gorgon M (2012) Real-time background generation and foreground object segmentation for high-definition colour video stream in FPGA device. J Real-Time Image Proc 9(1):61\u201377","journal-title":"J Real-Time Image Proc"},{"key":"3325_CR71","doi-asserted-by":"crossref","unstructured":"Park J, Sung W (2016) FPGA based implementation of deep neural networks using on-chip memory only. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","DOI":"10.1109\/ICASSP.2016.7471828"},{"issue":"2","key":"3325_CR72","doi-asserted-by":"publisher","first-page":"350","DOI":"10.3390\/s19020350","volume":"19","author":"M Zhao","year":"2019","unstructured":"Zhao M, Hu C, Wei F, Wang K, Wang C, Jiang Y (2019) Real-time underwater image recognition with FPGA embedded system for convolutional neural network. Sensors 19(2):350","journal-title":"Sensors"},{"key":"3325_CR73","doi-asserted-by":"crossref","unstructured":"Zhang T, Zhou W, Jiang X, Liu Y (2018) FPGA-based implementation of hand gesture recognition using convolutional neural network. Presented at the 2018 IEEE International Conference on Cyborg and Bionic Systems (CBS)","DOI":"10.1109\/CBS.2018.8612238"},{"key":"3325_CR74","doi-asserted-by":"crossref","unstructured":"Reyes E, G\u00f3mez C, Norambuena E, Ruiz-del-Solar J (2019) Near real-time object recognition for pepper based on deep neural networks running on a backpack. In: RoboCup 2018: Robot World Cup XXII. Springer, pp 287\u2013298","DOI":"10.1007\/978-3-030-27544-0_24"},{"key":"3325_CR75","doi-asserted-by":"crossref","unstructured":"Zhou Y, Wang W, Huang X (2015) FPGA design for PCANet deep learning network. In: IEEE 23rd Annual International Symposium on Field-Programmable Custom Computing Machines","DOI":"10.1109\/FCCM.2015.45"},{"issue":"1","key":"3325_CR76","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1109\/TCSVT.2014.2335831","volume":"25","author":"H Hikawa","year":"2015","unstructured":"Hikawa H, Kaida K (2015) Novel FPGA implementation of hand sign recognition system with SOM-Hebb classifier. IEEE Trans Circuits Syst Video Technol 25(1):153\u2013166","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"3325_CR77","doi-asserted-by":"crossref","unstructured":"Svab J, Krajnik T, Faigl J, Preucil L (2009) FPGA based speeded up robust features. Presented at the 2009 IEEE International Conference on Technologies for Practical Robot Applications (TePRA)","DOI":"10.1109\/TEPRA.2009.5339646"},{"key":"3325_CR78","doi-asserted-by":"crossref","unstructured":"Yao L, Feng H, Zhu Y, Jiang Z, Zhao D, Feng W (2009) An architecture of optimised SIFT feature detection for an FPGA implementation of an image matcher. In: International Conference on Field-Programmable Technology","DOI":"10.1109\/FPT.2009.5377651"},{"issue":"1","key":"3325_CR79","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1109\/TCSVT.2012.2202195","volume":"23","author":"Q Gu","year":"2013","unstructured":"Gu Q, Takaki T, Ishii I (2013) Fast FPGA-based multiobject feature extraction. IEEE Trans Circuits Syst Video Technol 23(1):30\u201345","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"4","key":"3325_CR80","doi-asserted-by":"publisher","first-page":"1070","DOI":"10.1109\/JSSC.2014.2386892","volume":"50","author":"P Knag","year":"2015","unstructured":"Knag P, Kim JK, Chen T, Zhang Z (2015) A sparse coding neural network ASIC with on-chip learning for feature extraction and encoding. IEEE J Solid-State Circuits 50(4):1070\u20131079","journal-title":"IEEE J Solid-State Circuits"},{"key":"3325_CR81","doi-asserted-by":"crossref","unstructured":"Bouris D, Nikitakis A, Papaefstathiou I (2010) Fast and efficient FPGA-based feature detection employing the SURF algorithm. Presented at the 2010 18th IEEE Annual International Symposium on Field-Programmable Custom Computing Machines","DOI":"10.1109\/FCCM.2010.11"},{"key":"3325_CR82","doi-asserted-by":"crossref","unstructured":"Ali U, Malik MB, Munawar K (2009) FPGA\/soft-processor based real-time object tracking system. In: 5th Southern Conference on Programmable Logic (SPL)","DOI":"10.1109\/SPL.2009.4914888"},{"key":"3325_CR83","doi-asserted-by":"crossref","unstructured":"Liu S, Papakonstantinou A, Wang H, Chen D (2011) Real-time object tracking system on FPGAs. Presented at the 2011 Symposium on Application Accelerators in High-Performance Computing (SAAHPC 2011)","DOI":"10.1109\/SAAHPC.2011.22"},{"key":"3325_CR84","unstructured":"Kryjak T, Gorgon M (2013) Real-time implementation of the ViBe foreground object segmentation algorithm. In: Federated Conference on Computer Science and Information Systems (FedCSIS), pp 591\u2013596"},{"issue":"1","key":"3325_CR85","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1109\/TC.2013.204","volume":"64","author":"F Saqib","year":"2015","unstructured":"Saqib F, Dutta A, Plusquellic J, Ortiz P, Pattichis MS (2015) Pipelined decision tree classification accelerator implementation in FPGA (DT-CAIF). IEEE Trans Comput 64(1):280\u2013285","journal-title":"IEEE Trans Comput"},{"key":"3325_CR86","doi-asserted-by":"crossref","unstructured":"Pan J, Lauterbach C, Manocha D (2010) g-Planner: real-time motion planning and global navigation using GPUs. In: Proceedings of AAAI Conference on Artificial Intelligence 1245\u20131251","DOI":"10.1609\/aaai.v24i1.7732"},{"issue":"3","key":"3325_CR87","doi-asserted-by":"publisher","first-page":"476","DOI":"10.1016\/j.engappai.2011.12.005","volume":"25","author":"B Vasumathi","year":"2012","unstructured":"Vasumathi B, Moorthi S (2012) Implementation of hybrid ANN-PSO algorithm on FPGA for harmonic estimation. Eng Appl Artif Intell 25(3):476\u2013483","journal-title":"Eng Appl Artif Intell"},{"key":"3325_CR88","unstructured":"Appleyard J, Kocisky T, Blunsom P (2016) Optimizing performance of recurrent neural networks on gpus. arXiv preprint arXiv:1604.01946"},{"key":"3325_CR89","doi-asserted-by":"crossref","unstructured":"Wang Y, Xu J, Han Y, Li H, Li X (2016) DeepBurning: automatic generation of FPGA-based learning accelerators for the neural network family, pp 1\u20136","DOI":"10.1145\/2897937.2898003"},{"key":"3325_CR90","doi-asserted-by":"crossref","unstructured":"Sharma H, Park J, Amaro E, Thwaites B, Kotha P, Gupta A, Kim Joon K, Mishra A, Esmaeilzadeh H (2016) DNNWeaver: from high-level deep network models to FPGA acceleration. In: Workshop on Cognitive Architectures","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"3325_CR91","doi-asserted-by":"crossref","unstructured":"DiCecco R, Lacey G, Vasiljevic J, Chow P, Taylor G, Areibi S (2016) Caffeinated FPGAs: FPGA framework for convolutional neural networks. In: International Conference on Field-Programmable Technology (FPT)","DOI":"10.1109\/FPT.2016.7929549"},{"key":"3325_CR92","doi-asserted-by":"crossref","unstructured":"Umuroglu Y, Fraser NJ, Gambardella G, Blott M, Leong P, Jahre M, Vissers K (2017) FINN: a framework for fast, scalable binarized neural network inference. In: Proceedings of the 2017 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays\u2014FPGA \u201917","DOI":"10.1145\/3020078.3021744"},{"key":"3325_CR93","doi-asserted-by":"crossref","unstructured":"Geng T, Wang T, Sanaullah A, Yang C, Patel R, Herbordt M (2018) A framework for acceleration of CNN training on deeply-pipelined FPGA clusters with work and weight load balancing. Presented at the 2018 28th International Conference on Field Programmable Logic and Applications (FPL)","DOI":"10.1109\/FPL.2018.00074"},{"key":"3325_CR94","doi-asserted-by":"crossref","unstructured":"Jia Y, Shelhamer E, Donahue J, Karayev S, Long J, Girshick R, Guadarrama S, Darrell T (2014) Caffe: convolutional architecture for fast feature embedding. In: Proceedings of the ACM International Conference on Multimedia\u2014MM \u201914","DOI":"10.1145\/2647868.2654889"},{"key":"3325_CR95","doi-asserted-by":"crossref","unstructured":"Venieris SI, Bouganis C-S (2016) FPAGConvNet: a framework for mapping convolutional neural networks on FPGAs. In: IEEE 24th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","DOI":"10.1109\/FCCM.2016.22"},{"key":"3325_CR96","doi-asserted-by":"crossref","unstructured":"Samragh M, Ghasemzadeh M, Koushanfar F (2017) Customizing neural networks for efficient FPGA implementation. Presented at the 2017 IEEE 25th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","DOI":"10.1109\/FCCM.2017.43"},{"issue":"3","key":"3325_CR97","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3079758","volume":"10","author":"Z Liu","year":"2017","unstructured":"Liu Z, Dou Y, Jiang J, Xu J, Li S, Zhou Y, Xu Y (2017) Throughput-optimized FPGA accelerator for deep convolutional neural networks. ACM Trans Reconfig Technol Syst 10(3):1\u201323","journal-title":"ACM Trans Reconfig Technol Syst"},{"key":"3325_CR98","doi-asserted-by":"crossref","unstructured":"Guan Y, Liang H, Xu N, Wang W, Shi S, Chen X, Sun G, Zhang W, Cong J (2017) FP-DNN: an automated framework for mapping deep neural networks onto FPGAs with RTL-HLS hybrid templates. In: IEEE 25th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","DOI":"10.1109\/FCCM.2017.25"},{"key":"3325_CR99","doi-asserted-by":"crossref","unstructured":"Wei X, Yu CH, Zhang P, Chen Y, Wang Y, Hu H, Cong J (2017) Automated systolic array architecture synthesis for high throughput CNN inference on FPGAs. Presented at the 54th Annual Design Automation Conference 2017","DOI":"10.1145\/3061639.3062207"},{"key":"3325_CR100","doi-asserted-by":"crossref","unstructured":"Zhao R, Ng H-C, Luk W, Niu X (2018) Towards efficient convolutional neural network for domain-specific applications on FPGA. In: 28th International Conference on Field Programmable Logic and Applications (FPL)","DOI":"10.1109\/FPL.2018.00033"},{"key":"3325_CR101","doi-asserted-by":"crossref","unstructured":"Bottleson J, Kim S, Andrews J, Bindu P, Murthy DN, Jin J (2016) clCaffe: OpenCL accelerated Caffe for convolutional neural networks. In: IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)","DOI":"10.1109\/IPDPSW.2016.182"},{"key":"3325_CR102","doi-asserted-by":"crossref","unstructured":"Rabhi S, Sun W, Perez J, Kristensen MRB, Liu J, Oldridge E (2019) Accelerating recommender system training 15x with RAPIDS. In: Proceedings of the Workshop on ACM Recommender Systems Challenge. RecSys Challenge \u201919: ACM Recommender Systems Challenge 2019 Workshop","DOI":"10.1145\/3359555.3359564"},{"key":"3325_CR103","doi-asserted-by":"crossref","unstructured":"Gong J, Shen H, Zhang G, Liu X, Li S, Jin G, Maheshwari N, Fomenko E, Segal E (2018) Highly efficient 8-bit low precision inference of convolutional neural networks with IntelCaffe. In: Proceedings of the 1st on Reproducible Quality-Efficient Systems Tournament on Co-designing Pareto-efficient Deep Learning (ReQuEST \u201918). Association for Computing Machinery, New York, NY, USA, Article 2, 1","DOI":"10.1145\/3229762.3229763"},{"issue":"4","key":"3325_CR104","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1109\/LES.2017.2743247","volume":"9","author":"K Abdelouahab","year":"2017","unstructured":"Abdelouahab K, Pelcat M, Serot J, Bourrasset C, Berry F (2017) Tactics to directly map CNN graphs on embedded FPGAs. IEEE Embed Syst Lett 9(4):113\u2013116","journal-title":"IEEE Embed Syst Lett"},{"key":"3325_CR105","doi-asserted-by":"crossref","unstructured":"Sharma H et\u00a0al (2016) From High-level deep neural models to FPGAs. In: 49th Annual IEEE\/ACM International Symposium on Microarchitecture, pp 1\u201312","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"3325_CR106","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.vlsi.2017.12.009","volume":"62","author":"Y Ma","year":"2018","unstructured":"Ma Y, Suda N, Cao Y, Vrudhula S, Seo JS (2018) ALAMO: FPGA acceleration of deep learning algorithms with a modularized RTL compiler. Integration 62:14\u201323","journal-title":"Integration"},{"key":"3325_CR107","doi-asserted-by":"crossref","unstructured":"Venieris SI (2017) Latency-driven design for FPGA-based convolutional neural networks","DOI":"10.23919\/FPL.2017.8056828"},{"key":"3325_CR108","doi-asserted-by":"crossref","unstructured":"Zeng H, Zhang C, Prasanna V (2018) Fast generation of high throughput customized deep learning accelerators on FPGAs. In: International Conference on Reconfigurable Computing FPGAs, ReConFig 2017, vol 2018-Janua, pp 1\u20138","DOI":"10.1109\/RECONFIG.2017.8279792"},{"key":"3325_CR109","doi-asserted-by":"crossref","unstructured":"Venieris SI (2018) f-CNN x : a toolflow for mapping multiple convolutional neural networks on FPGAs","DOI":"10.1109\/FPL.2018.00072"},{"key":"3325_CR110","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2884972","author":"Y Ma","year":"2020","unstructured":"Ma Y, Cao Y, Vrudhula S, Seo JS (2020) Automatic compilation of diverse CNNs onto high-performance FPGA accelerators. IEEE Trans Comput Des Integr Circuits Syst. https:\/\/doi.org\/10.1109\/TCAD.2018.2884972","journal-title":"IEEE Trans Comput Des Integr Circuits Syst"},{"key":"3325_CR111","unstructured":"Ma Y, Suda N, Cao Y, Seo JS, Vrudhula S (2016) Scalable and modularized RTL compilation of convolutional neural networks onto FPGA. In: 26th International Conference on Field-Programmable Logic and Applications (FPL)"},{"key":"3325_CR112","doi-asserted-by":"crossref","unstructured":"Cadambi S, Graf HP (2010) A programmable parallel accelerator for learning and classification, pp 273\u2013283","DOI":"10.1145\/1854273.1854309"},{"key":"3325_CR113","unstructured":"Art P (2011) Artificial neural network acceleration on FPGA using custom instruction, pp 450\u2013455"},{"key":"3325_CR114","doi-asserted-by":"crossref","unstructured":"Luo G, Zhang C, Cong J, Sun J, Sun G, Wu D (2016) Energy-efficient CNN implementation on a deeply pipelined FPGA cluster, pp 326\u2013331","DOI":"10.1145\/2934583.2934644"},{"key":"3325_CR115","unstructured":"Sun F et\u00a0al (2018) A high-performance accelerator for large-scale convolutional neural networks. In: Proceedings of the 15th IEEE International Symposium on International Parallel and Distributed Processing with Application. 16th IEEE International Conference on Ubiquitous Computing and Communications, ISPA\/IUCC 2017, pp 622\u2013629"},{"issue":"2","key":"3325_CR116","first-page":"2010","volume":"82","author":"Y Qiao","year":"2011","unstructured":"Qiao Y (2011) FPGA-accelerated deep convolutional neural networks for high throughput and energy efficiency. Seismol Res Lett 82(2):2010\u20132011","journal-title":"Seismol Res Lett"},{"key":"3325_CR117","doi-asserted-by":"crossref","unstructured":"Motamedi M, Gysel P, Akella V, Ghiasi S (2016) Design space exploration of FPGA-based deep convolutional neural networks. In: Proceeding of Asia and South Pacific Design Automation Conference, ASP-DAC, vol 25\u201328 Jan, pp 575\u2013580","DOI":"10.1109\/ASPDAC.2016.7428073"},{"key":"3325_CR118","doi-asserted-by":"crossref","unstructured":"Rahman A, Lee J, Choi K (2016) Efficient FPGA acceleration of convolutional neural networks using logical-3D compute array, pp 1393\u20131398","DOI":"10.3850\/9783981537079_0833"},{"key":"3325_CR119","doi-asserted-by":"crossref","unstructured":"Zhang J, Li J (2017) Improving the performance of OpenCL-based FPGA accelerator for convolutional neural network, pp 25\u201334","DOI":"10.1145\/3020078.3021698"},{"key":"3325_CR120","doi-asserted-by":"crossref","unstructured":"Yonekawa H, Nakahara H (2017) On-chip memory based binarized convolutional deep neural network applying batch normalization free technique on an FPGA. In: Proceedings of IEEE 31st International Parallel and Distributed Processing Symposium Work, IPDPSW, pp 98\u2013105","DOI":"10.1109\/IPDPSW.2017.95"},{"key":"3325_CR121","doi-asserted-by":"crossref","unstructured":"Nakahara H, Fujii T, Sato S (2017) A fully connected layer elimination for a binarizec convolutional neural network on an FPGA. In: 27th International Conference on Field-Programmable Logic and Applications (FPL), pp 1\u20134","DOI":"10.23919\/FPL.2017.8056771"},{"key":"3325_CR122","unstructured":"Kim L (2017) DeepX: deep learning accelerator for restricted Boltzmann machine artificial neural networks, pp 1\u201313"},{"key":"3325_CR123","doi-asserted-by":"crossref","unstructured":"Zhao R et\u00a0al (2017) Accelerating binarized convolutional neural networks with software-programmable FPGAs, pp 15\u201324","DOI":"10.1145\/3020078.3021741"},{"key":"3325_CR124","doi-asserted-by":"crossref","unstructured":"Aydonat U, O\u2019Connell S, Capalija D, Ling AC, Chiu GR (2017) An OpenCL(TM) deep learning accelerator on Arria 10, pp 55\u201364","DOI":"10.1145\/3020078.3021738"},{"key":"3325_CR125","doi-asserted-by":"crossref","unstructured":"Shimoda M, Sato S, Nakahara H (2018) All binarized convolutional neural network and its implementation on an FPGA. In: International Conference on Field-Programmable Technology, ICFPT, vol 2018-Janua, pp 291\u2013294","DOI":"10.1109\/FPT.2017.8280163"},{"key":"3325_CR126","doi-asserted-by":"crossref","unstructured":"Xian A, Chang M, Culurciello E (2017) Hardware accelerators for recurrent neural networks on FPGA, pp 0\u20133","DOI":"10.1109\/ISCAS.2017.8050816"},{"key":"3325_CR127","doi-asserted-by":"crossref","unstructured":"Guo J, Yin S, Ouyang P, Liu L, Wei S (2017) Bit-width based resource partitioning for CNN acceleration on FPGA. In: Proceedings of IEEE 25th Annual International Symposium on Field-Programmable Custom Computing Machines. FCCM 2017, p 31","DOI":"10.1109\/FCCM.2017.13"},{"key":"3325_CR128","doi-asserted-by":"crossref","unstructured":"Zhang C, Prasanna V (2017) Frequency domain acceleration of convolutional neural networks on CPU-FPGA shared memory system, pp 35\u201344","DOI":"10.1145\/3020078.3021727"},{"key":"3325_CR129","unstructured":"Yan S, Lu L, Liang Y, Xiao Q, Tai Y-W (2017) Exploring heterogeneous algorithms for accelerating deep convolutional neural networks on FPGAs, pp 1\u20136"},{"key":"3325_CR130","doi-asserted-by":"crossref","unstructured":"Gong L, Wang C, Li X, Chen X, Zhou X (2017) Work-in-progress: a power-efficient and high performance FPGA accelerator for convolutional neural networks","DOI":"10.1145\/3125502.3125534"},{"key":"3325_CR131","doi-asserted-by":"crossref","unstructured":"Ma Y, Cao Y, Vrudhula S, Seo J (2017) Optimizing loop operation and dataflow in FPGA acceleration of deep convolutional neural networks, pp 45\u201354","DOI":"10.1145\/3020078.3021736"},{"key":"3325_CR132","doi-asserted-by":"crossref","unstructured":"Nguyen D, Kim D, Lee J (2017) Double MAC: doubling the performance of convolutional neural networks on modern FPGAs. In: Proceedings of 2017 Design, Automation and Test in Europe Conference and Exhibition, pp 890\u2013893","DOI":"10.23919\/DATE.2017.7927113"},{"key":"3325_CR133","doi-asserted-by":"crossref","unstructured":"Hwang WJ, Jhang YJ, Tai TM (2017) An efficient FPGA-based architecture for convolutional neural networks. In: 40th International Conference on Telecommunications and Signal Processing, TSP, vol 2017-Janua, pp 582\u2013588","DOI":"10.1109\/TSP.2017.8076054"},{"issue":"7","key":"3325_CR134","doi-asserted-by":"publisher","first-page":"1354","DOI":"10.1109\/TVLSI.2018.2815603","volume":"26","author":"Y Ma","year":"2018","unstructured":"Ma Y, Cao Y, Vrudhula S, Seo JS (2018) Optimizing the convolution operation to accelerate deep neural networks on FPGA. IEEE Trans Very Large Scale Integr Syst 26(7):1354\u20131367","journal-title":"IEEE Trans Very Large Scale Integr Syst"},{"key":"3325_CR135","doi-asserted-by":"crossref","unstructured":"Guan Y, Yuan Z, Sun G, Cong J (2017) FPGA-based accelerator for long short-term memory recurrent neural networks. In: Proceedings of Asia and South Pacific Design Automation Conference, ASP-DAC, pp 629\u2013634","DOI":"10.1109\/ASPDAC.2017.7858394"},{"key":"3325_CR136","doi-asserted-by":"crossref","unstructured":"Ma Y, Kim M, Cao Y, Vrudhula S, Seo JS (2017) End-to-end scalable FPGA accelerator for deep residual networks. In: Proceedings of IEEE International Symposium on Circuits and Systems, pp 0\u20133","DOI":"10.1109\/ISCAS.2017.8050344"},{"key":"3325_CR137","doi-asserted-by":"crossref","unstructured":"Yu J et\u00a0al (2018) Instruction driven cross-layer CNN accelerator with winograd transformation on FPGA. In: International Conference on Field-Programmable Technology, ICFPT 2017, vol 2018-Janua, pp 227\u2013230","DOI":"10.1109\/FPT.2017.8280147"},{"key":"3325_CR138","doi-asserted-by":"crossref","unstructured":"Kim JH, Grady B, Lian B, Brothers J, Anderson JH (2017) FPGA-based CNN inference accelerator synthesized from multi-threaded C software, pp 268\u2013273","DOI":"10.1109\/SOCC.2017.8226056"},{"key":"3325_CR139","doi-asserted-by":"crossref","unstructured":"Moss DJM et\u00a0al (2017) High performance binary neural networks on the Xeon+FPGATM platform. In: 27th International Conference on Field-Programmable Logic and Applications (FPL)","DOI":"10.23919\/FPL.2017.8056823"},{"issue":"1","key":"3325_CR140","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1109\/TCAD.2017.2705069","volume":"37","author":"K Guo","year":"2018","unstructured":"Guo K et al (2018) Angel-Eye: a complete design flow for mapping CNN onto embedded FPGA. IEEE Trans Comput Des Integr Circuits Syst 37(1):35\u201347","journal-title":"IEEE Trans Comput Des Integr Circuits Syst"},{"issue":"11","key":"3325_CR141","doi-asserted-by":"publisher","first-page":"2601","DOI":"10.1109\/TCAD.2018.2857078","volume":"37","author":"L Gong","year":"2018","unstructured":"Gong L, Wang C, Li X, Chen H, Zhou X (2018) MALOC: a fully pipelined FPGA accelerator for convolutional neural networks with all layers mapped on chip. IEEE Trans Comput Des Integr Circuits Syst 37(11):2601\u20132612","journal-title":"IEEE Trans Comput Des Integr Circuits Syst"},{"key":"3325_CR142","unstructured":"Duarte RP (2018) Lite-CNN: a high-performance architecture to execute CNNs in low density FPGAs"},{"key":"3325_CR143","doi-asserted-by":"crossref","unstructured":"Rybalkin V, Pappalardo A, Ghaffar MM, Gambardella G, Wehn N, Blott M (2018) FINN-L: Library extensions and design trade-off analysis for variable precision LSTM networks on FPGAs. In: Proceedings of 2018 International Conference on Field-Programmable Logic and Applications (FPL), pp 89\u201396","DOI":"10.1109\/FPL.2018.00024"},{"key":"3325_CR144","doi-asserted-by":"crossref","unstructured":"Yu Q, Wang C, Ma X, Li X, Zhou X, (2015) A deep learning prediction process accelerator based FPGA. In: Proceedings of 2015 IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2015, no 500, pp 1159\u20131162","DOI":"10.1109\/CCGrid.2015.114"},{"key":"3325_CR145","doi-asserted-by":"crossref","unstructured":"Abdelfattah MS et\u00a0al (2018) DLA: compiler and FPGA overlay for neural network inference acceleration","DOI":"10.1109\/FPL.2018.00077"},{"key":"3325_CR146","doi-asserted-by":"crossref","unstructured":"Nurvitadhi E et\u00a0al (2018) In-package domain-specific ASICs for Intel\u00ae Stratix\u00ae 10 FPGAs: a case study of accelerating deep learning using TensorTile ASIC, pp 106\u2013110","DOI":"10.1145\/3174243.3174966"},{"key":"3325_CR147","doi-asserted-by":"crossref","unstructured":"Zhang C (2015) Optimizing FPGA-based accelerator design for deep convolutional neural networks, pp 161\u2013170","DOI":"10.1145\/2684746.2689060"},{"key":"3325_CR148","doi-asserted-by":"crossref","unstructured":"Qiu J et\u00a0al (2016) Going deeper with embedded FPGA platform for convolutional neural network, pp 26\u201335","DOI":"10.1145\/2847263.2847265"},{"key":"3325_CR149","unstructured":"Vrudhula S et\u00a0al (2016) Throughput-optimized OpenCL-based FPGA accelerator for large-scale convolutional neural networks, pp 16\u201325"},{"key":"3325_CR150","doi-asserted-by":"crossref","unstructured":"Wang Y et\u00a0al (2016) Low power convolutional neural networks on a chip. In: Proceedings of IEEE International Symposium on Circuits and Systems, vol 2016-July, no 1, pp 129\u2013132","DOI":"10.1109\/ISCAS.2016.7527187"},{"key":"3325_CR151","unstructured":"Feng G, Hu Z, Chen S, Wu F (2016) Energy-efficient and high-throughput FPGA-based accelerator for convolutional neural networks, pp 4\u20136"},{"issue":"3","key":"3325_CR152","first-page":"513","volume":"36","author":"C Wang","year":"2017","unstructured":"Wang C, Gong L, Yu Q, Li X, Xie Y, Zhou X (2017) DLAU: a scalable deep learning accelerator unit on FPGA. IEEE Trans Comput Des Integr Circuits Syst 36(3):513\u2013517","journal-title":"IEEE Trans Comput Des Integr Circuits Syst"},{"key":"3325_CR153","unstructured":"Park J, Lotfi-Kamran P, Sharma H, Esmaeilzadeh H, Yazdanbakhsh A (2016) Neural acceleration for GPU throughput processors, pp 482\u2013493"},{"key":"3325_CR154","doi-asserted-by":"crossref","unstructured":"Strigl D, Kofler K, Podlipnig S (2010) Performance and scalability of GPU-based convolutional neural networks. In: Proceedings of the 18th Euromicro Conference on Parallel, Distributed and Network-based Processing, PDP 2010, pp 317\u2013324","DOI":"10.1109\/PDP.2010.43"},{"key":"3325_CR155","doi-asserted-by":"crossref","unstructured":"Guzhva A, Dolenko S, Persiantsev I (2009) Multifold acceleration of neural network computations using GPU. In: Artificial Neural Networks\u2014ICANN 2009, pp 373\u2013380","DOI":"10.1007\/978-3-642-04274-4_39"},{"key":"3325_CR156","doi-asserted-by":"crossref","unstructured":"Li B, Zhou E, Huang B, Duan J, Wang Y, Xu N, Zhang J, Yang H (2014) Large scale recurrent neural network on GPU. In: International Joint Conference on Neural Networks (IJCNN)","DOI":"10.1109\/IJCNN.2014.6889433"},{"key":"3325_CR157","doi-asserted-by":"crossref","unstructured":"Kim Y, Lee J, Kim J-S, Jei H, Roh H (2018) Efficient multi-GPU memory management for deep learning acceleration. In: IEEE 3rd International Workshops on Foundations and Applications of Self* Systems (FAS*W)","DOI":"10.1109\/FAS-W.2018.00023"},{"key":"3325_CR158","doi-asserted-by":"crossref","unstructured":"Bhuiyan MA, Pallipuram VK, Smith MC (2010) Acceleration of spiking neural networks in emerging multi-core and GPU architectures. In: IEEE International Symposium on Parallel and Distributed Processing, Workshops and Phd Forum (IPDPSW)","DOI":"10.1109\/IPDPSW.2010.5470899"},{"key":"3325_CR159","doi-asserted-by":"crossref","unstructured":"Zhang X, Gu N, Ye H (2016) Multi-GPU based recurrent neural networks language model training. In: Communications in computer and information science, pp 484\u2013493","DOI":"10.1007\/978-981-10-2053-7_43"},{"key":"3325_CR160","doi-asserted-by":"crossref","unstructured":"Potluri S, Fasih A, Vutukuru LK, Machot FA, Kyamakya K (2011) CNN based high performance computing for real time image processing on GPU. Presented at the 16th Int\u2019l Symposium on Theoretical Electrical Engineering (ISTET)","DOI":"10.1109\/INDS.2011.6024781"},{"key":"3325_CR161","unstructured":"Farah NICLA (2014) A new classification approach for neural networks hardware: from standards chips to embedded systems on chip, pp 491\u2013534"},{"key":"3325_CR162","doi-asserted-by":"crossref","unstructured":"Jin L, Wang Z, Gu R, Yuan C, Huang Y (2014) Training large scale deep neural networks on the Intel Xeon Phi many-core coprocessor. In: IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)","DOI":"10.1109\/IPDPSW.2014.194"},{"key":"3325_CR163","doi-asserted-by":"crossref","unstructured":"Kurth T, Zhang J, Satish N, Racah E, Mitliagkas I, Patwary MMA, Malas T, Sundaram N, Bhimji W, Smorkalov M et\u00a0al (2017) Deep learning at 15PF: supervised and semi-supervised classification for scientific data. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. ACM, pp 7","DOI":"10.1145\/3126908.3126916"},{"key":"3325_CR164","doi-asserted-by":"crossref","unstructured":"Georganas E, Avancha S, Banerjee K, Kalamkar D, Henry G, Pabst H, Heinecke A (2018) Anatomy of high-performance deep learning convolutions on SIMD architectures. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, SC \u201918, Piscataway, NJ, USA. IEEE Press, pp 66:1\u201366:12","DOI":"10.1109\/SC.2018.00069"},{"issue":"1","key":"3325_CR165","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1007\/s11227-017-1994-x","volume":"75","author":"A Viebke","year":"2017","unstructured":"Viebke A, Memeti S, Pllana S, Abraham A (2017) CHAOS: a parallelization scheme for training convolutional neural networks on Intel Xeon Phi. J Supercomput 75(1):197\u2013227","journal-title":"J Supercomput"},{"key":"3325_CR166","doi-asserted-by":"crossref","unstructured":"Mathuriya A, Bard D, Mendygral P, Meadows L, Arnemann J, Shao L, He S, Karna T, Moise D, Pennycook SJ, Maschhoff K, Sewall J, Kumar N, Ho S, Ringenburg MF, Prabhat P, Lee V (2018) CosmoFlow: using deep learning to learn the universe at scale. In: SC18: International Conference for High Performance Computing, Networking, Storage and Analysis","DOI":"10.1109\/SC.2018.00068"},{"key":"3325_CR167","doi-asserted-by":"crossref","unstructured":"Hu Y, Zhai J, Li D, Gong Y, Zhu Y, Liu W, Su L, Jin J (2018) BitFlow: exploiting vector parallelism for binary neural networks on CPU. Presented at the 2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","DOI":"10.1109\/IPDPS.2018.00034"},{"key":"3325_CR168","unstructured":"\u201cVirtex-5\u201d, Xilinx.com (2019). https:\/\/www.xilinx.com\/products\/boards-and-kits\/device-family\/nav-virtex-5.html. Accessed 16 Oct 2019"},{"key":"3325_CR169","unstructured":"\u201cStratix V GX FPGA Development Kit\u201d, Intel.com (2019). https:\/\/intel.ly\/31pCBMl. Accessed 16 Oct 2019"},{"key":"3325_CR170","unstructured":"\u201cArria 10 GX FPGA Development Kit\u201d, Intel.com (2019). https:\/\/intel.ly\/2ITEPwO. Accessed 16 Oct 2019"},{"key":"3325_CR171","doi-asserted-by":"crossref","unstructured":"Chen Y et al (2014) DaDianNao: a machine-learning supercomputer","DOI":"10.1109\/MICRO.2014.58"},{"key":"3325_CR172","unstructured":"Amazon.com (2019). https:\/\/www.amazon.com\/NVIDIA-Computing-Processor-Graphic-900-22081-2250-000\/dp\/B00KDRRTB8. Accessed: 16 Oct 2019"},{"key":"3325_CR173","unstructured":"Amazon.com (2019). https:\/\/www.amazon.com\/Nvidia-TESLA-Accelerator-Processing-900-2G600-0000-000\/dp\/B01MDNO5BK. Accessed 16 Oct 2019"},{"key":"3325_CR174","unstructured":"\u201cNVIDIA GeForce GT 730 Review\u201d, Benchmarks.ul.com (2019). https:\/\/benchmarks.ul.com\/hardware\/gpu\/NVIDIA+GeForce+GT+730+review. Accessed 16 Oct 2019"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-020-03325-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-020-03325-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-020-03325-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,25]],"date-time":"2022-10-25T12:52:51Z","timestamp":1666702371000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-020-03325-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5,28]]},"references-count":174,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2021,2]]}},"alternative-id":["3325"],"URL":"https:\/\/doi.org\/10.1007\/s11227-020-03325-8","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,5,28]]},"assertion":[{"value":"28 May 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}