{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T05:34:41Z","timestamp":1773639281915,"version":"3.50.1"},"reference-count":39,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Integration"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.vlsi.2026.102695","type":"journal-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T16:54:51Z","timestamp":1772816091000},"page":"102695","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["BenDan: Benchmarking DPU performance on FPGAs"],"prefix":"10.1016","volume":"109","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4706-4617","authenticated-orcid":false,"given":"Han","family":"Bao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3815-4953","authenticated-orcid":false,"given":"Xingyu","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8435-3478","authenticated-orcid":false,"given":"Xiaoke","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9331-4692","authenticated-orcid":false,"given":"Ahmed","family":"Sadaqa","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5014-3419","authenticated-orcid":false,"given":"Yanxiang","family":"Zhu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5493-7411","authenticated-orcid":false,"given":"Shidi","family":"Tang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6837-5675","authenticated-orcid":false,"given":"Ruiqi","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8866-7189","authenticated-orcid":false,"given":"Ming","family":"Ling","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4877-9688","authenticated-orcid":false,"given":"Bruno da","family":"Silva","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.vlsi.2026.102695_b1","series-title":"End to end learning for self-driving cars","author":"Bojarski","year":"2016"},{"key":"10.1016\/j.vlsi.2026.102695_b2","series-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.vlsi.2026.102695_b3","doi-asserted-by":"crossref","unstructured":"J. Redmon, S. Divvala, R. Girshick, A. Farhadi, You only look once: Unified, real-time object detection, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 779\u2013788.","DOI":"10.1109\/CVPR.2016.91"},{"key":"10.1016\/j.vlsi.2026.102695_b4","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.vlsi.2026.102695_b5","series-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017"},{"key":"10.1016\/j.vlsi.2026.102695_b6","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.vlsi.2026.102695_b7","series-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"10.1016\/j.vlsi.2026.102695_b8","doi-asserted-by":"crossref","unstructured":"S. Shen, D. Yang, Y. Xie, C. Pei, B. Yu, W. Yu, Deep-Learning-Based Pre-Layout Parasitic Capacitance Prediction on SRAM Designs, in: Proceedings of the Great Lakes Symposium on VLSI 2024, 2024, pp. 440\u2013445.","DOI":"10.1145\/3649476.3658754"},{"key":"10.1016\/j.vlsi.2026.102695_b9","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2024.103247","article-title":"A survey of FPGA and ASIC designs for transformer inference acceleration and optimization","volume":"155","author":"Kang","year":"2024","journal-title":"J. Syst. Archit."},{"key":"10.1016\/j.vlsi.2026.102695_b10","series-title":"2025 62nd ACM\/IEEE Design Automation Conference","first-page":"1","article-title":"Few-shot learning on ams circuits and its application to parasitic capacitance prediction","author":"Shen","year":"2025"},{"key":"10.1016\/j.vlsi.2026.102695_b11","series-title":"2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines","first-page":"93","article-title":"Understanding performance differences of FPGAs and GPUs","author":"Cong","year":"2018"},{"issue":"1","key":"10.1016\/j.vlsi.2026.102695_b12","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3289185","article-title":"[DL] A survey of FPGA-based neural network inference accelerators","volume":"12","author":"Guo","year":"2019","journal-title":"ACM Trans. Reconfigurable Technol. Syst. (TRETS)"},{"key":"10.1016\/j.vlsi.2026.102695_b13","series-title":"Vitis AI User Guide (UG1414), Version 3.0: Deep-Learning Processor Unit","author":"(AMD)","year":"2023"},{"key":"10.1016\/j.vlsi.2026.102695_b14","series-title":"2023 60th ACM\/IEEE Design Automation Conference","first-page":"1","article-title":"Accelerating DNN inference with heterogeneous multi-dpu engines","author":"Du","year":"2023"},{"key":"10.1016\/j.vlsi.2026.102695_b15","doi-asserted-by":"crossref","DOI":"10.1016\/j.sysarc.2020.101896","article-title":"Benchmarking vision kernels and neural network inference accelerators on embedded platforms","volume":"113","author":"Qasaimeh","year":"2021","journal-title":"J. Syst. Archit."},{"issue":"3","key":"10.1016\/j.vlsi.2026.102695_b16","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3503465","article-title":"FPGA architecture exploration for DNN acceleration","volume":"15","author":"Roorda","year":"2022","journal-title":"ACM Trans. Reconfigurable Technol. Syst. (TRETS)"},{"key":"10.1016\/j.vlsi.2026.102695_b17","doi-asserted-by":"crossref","unstructured":"M. Petry, P. Gest, A. Koch, M. Ghiglione, M. Werner, Accelerated deep-learning inference on FPGAs in the space domain, in: Proceedings of the 20th ACM International Conference on Computing Frontiers, 2023, pp. 222\u2013228.","DOI":"10.1145\/3587135.3592763"},{"key":"10.1016\/j.vlsi.2026.102695_b18","series-title":"2020 IEEE 28th Annual International Symposium on Field-Programmable Custom Computing Machines","article-title":"High-throughput DNN inference with LogicNets","author":"Umuroglu","year":"2020"},{"key":"10.1016\/j.vlsi.2026.102695_b19","series-title":"DPUCZDX8G for Zynq UltraScale+ MPSoCs Product Guide (PG338)","author":"Advanced Micro Devices, Inc.","year":"2023"},{"issue":"11","key":"10.1016\/j.vlsi.2026.102695_b20","doi-asserted-by":"crossref","first-page":"3895","DOI":"10.1109\/TCAD.2023.3272582","article-title":"Koios 2.0: Open-source deep learning benchmarks for FPGA architecture and CAD research","volume":"42","author":"Arora","year":"2023","journal-title":"IEEE Trans. Comput.-Aided Des. Integr. Circuits Syst."},{"issue":"2","key":"10.1016\/j.vlsi.2026.102695_b21","first-page":"1","article-title":"Elastic-df: Scaling performance of DNN inference in FPGA clouds through automatic partitioning","volume":"15","author":"Alonso","year":"2021","journal-title":"ACM Trans. Reconfigurable Technol. Syst. (TRETS)"},{"issue":"1","key":"10.1016\/j.vlsi.2026.102695_b22","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3520142","article-title":"Optimus: An operator fusion framework for deep neural networks","volume":"22","author":"Cai","year":"2022","journal-title":"ACM Trans. Embed. Comput. Syst."},{"issue":"4","key":"10.1016\/j.vlsi.2026.102695_b23","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3417709","article-title":"Nnbench-x: A benchmarking methodology for neural network accelerator designs","volume":"17","author":"Xie","year":"2020","journal-title":"ACM Trans. Archit. Code Optim. (TACO)"},{"key":"10.1016\/j.vlsi.2026.102695_b24","doi-asserted-by":"crossref","unstructured":"S. Chheda, A. Curtis, E. Siegmann, B. Chapman, Performance study on CPU-based machine learning with PyTorch, in: Proceedings of the HPC Asia 2023 Workshops, 2023, pp. 24\u201334.","DOI":"10.1145\/3581576.3581615"},{"issue":"2","key":"10.1016\/j.vlsi.2026.102695_b25","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3570928","article-title":"Flexcnn: An end-to-end framework for composing CNN accelerators on FPGA","volume":"16","author":"Basalama","year":"2023","journal-title":"ACM Trans. Reconfigurable Technol. Syst."},{"key":"10.1016\/j.vlsi.2026.102695_b26","series-title":"2018 International Conference on Field-Programmable Technology","first-page":"310","article-title":"An FPGA realization of OpenPose based on a sparse weight convolutional neural network","author":"Akira","year":"2018"},{"issue":"10","key":"10.1016\/j.vlsi.2026.102695_b27","first-page":"1415","article-title":"A CNN accelerator on FPGA using depthwise separable convolution","volume":"65","author":"Bai","year":"2018","journal-title":"IEEE Trans. Circuits Syst. II: Express Briefs"},{"key":"10.1016\/j.vlsi.2026.102695_b28","series-title":"2016 26th International Conference on Field Programmable Logic and Applications","first-page":"1","article-title":"A high performance FPGA-based accelerator for large-scale convolutional neural networks","author":"Li","year":"2016"},{"key":"10.1016\/j.vlsi.2026.102695_b29","series-title":"2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture","first-page":"535","article-title":"Maximizing CNN accelerator efficiency through resource partitioning","author":"Shen","year":"2017"},{"key":"10.1016\/j.vlsi.2026.102695_b30","series-title":"2017 54th ACM\/EDAC\/IEEE Design Automation Conference","first-page":"1","article-title":"Automated systolic array architecture synthesis for high throughput CNN inference on FPGAs","author":"Wei","year":"2017"},{"key":"10.1016\/j.vlsi.2026.102695_b31","series-title":"2018 IEEE\/ACM International Conference on Computer-Aided Design","first-page":"1","article-title":"DNNBuilder: an automated tool for building high-performance DNN hardware accelerators for FPGAs","author":"Zhang","year":"2018"},{"key":"10.1016\/j.vlsi.2026.102695_b32","series-title":"2024 34th International Conference on Field-Programmable Logic and Applications","first-page":"197","article-title":"Revealing untapped DSP optimization potentials for FPGA-based systolic matrix engines","author":"Li","year":"2024"},{"key":"10.1016\/j.vlsi.2026.102695_b33","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1800","article-title":"Xception: Deep learning with depthwise separable convolutions","author":"Chollet","year":"2017"},{"key":"10.1016\/j.vlsi.2026.102695_b34","series-title":"2025 IEEE\/ACM International Conference on Computer Aided Design","first-page":"1","article-title":"Hummingbird: A smaller and faster large language model accelerator on embedded FPGA","author":"Li","year":"2025"},{"key":"10.1016\/j.vlsi.2026.102695_b35","series-title":"torch.nn.RNN \u2014 PyTorch Documentation","author":"PyTorch Core Team","year":"2024"},{"key":"10.1016\/j.vlsi.2026.102695_b36","series-title":"torch.nn.LSTM \u2014 PyTorch Documentation","author":"PyTorch Core Team","year":"2024"},{"key":"10.1016\/j.vlsi.2026.102695_b37","series-title":"torch.nn.GRU \u2014 PyTorch Documentation","author":"PyTorch Core Team","year":"2024"},{"key":"10.1016\/j.vlsi.2026.102695_b38","series-title":"torch.nn.Embedding \u2014 PyTorch Documentation","author":"PyTorch Core Team","year":"2024"},{"key":"10.1016\/j.vlsi.2026.102695_b39","series-title":"torch.nn.MultiheadAttention \u2014 PyTorch Documentation","author":"PyTorch Core Team","year":"2024"}],"container-title":["Integration"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167926026000507?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167926026000507?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T04:37:33Z","timestamp":1773635853000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167926026000507"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":39,"alternative-id":["S0167926026000507"],"URL":"https:\/\/doi.org\/10.1016\/j.vlsi.2026.102695","relation":{},"ISSN":["0167-9260"],"issn-type":[{"value":"0167-9260","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"BenDan: Benchmarking DPU performance on FPGAs","name":"articletitle","label":"Article Title"},{"value":"Integration","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.vlsi.2026.102695","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"102695"}}