{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T06:48:57Z","timestamp":1780469337701,"version":"3.54.1"},"reference-count":58,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. Comput. Sci. Technol."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s11390-025-4555-4","type":"journal-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T04:25:30Z","timestamp":1763699130000},"page":"1368-1385","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Decoupled Vector Processing Unit: Past, Present, and Future"],"prefix":"10.1007","volume":"40","author":[{"given":"Ruo-Xi","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dun-Bo","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qing-Jie","family":"Lang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dong-Huan","family":"Xie","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhi-Wei","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhen-Yu","family":"Gao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Li","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,11,21]]},"reference":[{"key":"4555_CR1","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1109\/MICRO.2002.1176257","volume-title":"Proc. the 35th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"C Kozyrakis","year":"2002","unstructured":"Kozyrakis C, Patterson D. Vector vs. superscalar and VLIW architectures for embedded multimedia benchmarks. In Proc. the 35th Annual IEEE\/ACM International Symposium on Microarchitecture, Nov. 2002, pp.283\u2013289. DOI: https:\/\/doi.org\/10.1109\/MICRO.2002.1176257."},{"key":"4555_CR2","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1109\/ISCA52012.2021.00025","volume-title":"Proc. the 48th ACM\/IEEE Annual International Symposium on Computer Architecture","author":"J M Domingos","year":"2021","unstructured":"Domingos J M, Neves N, Roma N, Tom\u00e1s P. Unlimited vector extension with data streaming support. In Proc. the 48th ACM\/IEEE Annual International Symposium on Computer Architecture, Jun. 2021, pp.209\u2013222. DOI: https:\/\/doi.org\/10.1109\/ISCA52012.2021.00025."},{"key":"4555_CR3","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254718","volume-title":"Proc. the 2023 IEEE Hot Chips 35 Symposium","author":"M Bruce","year":"2023","unstructured":"Bruce M. Arm Neoverse V2 platform: Leadership performance and power efficiency for next-generation cloud computing, ML and HPC workloads. In Proc. the 2023 IEEE Hot Chips 35 Symposium, Aug. 2023. DOI: https:\/\/doi.org\/10.1109\/HCS59251.2023.10254718."},{"key":"4555_CR4","doi-asserted-by":"publisher","first-page":"691","DOI":"10.1109\/HPCA56546.2023.10071074","volume-title":"Proc. the 2023 IEEE International Symposium on High-Performance Computer Architecture","author":"K Al-Hawaj","year":"2023","unstructured":"Al-Hawaj K, Ta T, Cebry N, Agwa S, Afuye O, Hall E, Golden C, Apsel A B, Batten C. EVE: Ephemeral vector engines. In Proc. the 2023 IEEE International Symposium on High-Performance Computer Architecture, Feb. 25\u2013Mar. 1, 2023, pp.691\u2013704. DOI: https:\/\/doi.org\/10.1109\/HPCA56546.2023.10071074."},{"issue":"2","key":"4555_CR5","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1109\/TVLSI.2019.2950087","volume":"28","author":"M Cavalcante","year":"2020","unstructured":"Cavalcante M, Schuiki F, Zaruba F, Schaffner M, Benini L. Ara: A 1-GHz+ scalable and energy-efficient RISC-V vector processor with multiprecision floating-point support in 22-nm FD-SOI. IEEE Trans. Very Large Scale Integration (VLSI) Systems, 2020, 28(2): 530\u2013543. DOI: https:\/\/doi.org\/10.1109\/TVLSI.2019.2950087.","journal-title":"IEEE Trans. Very Large Scale Integration (VLSI) Systems"},{"key":"4555_CR6","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1109\/ASAP54787.2022.00017","volume-title":"Proc. the 33rd IEEE International Conference on Application-specific Systems, Architectures and Processors","author":"M Perotti","year":"2022","unstructured":"Perotti M, Cavalcante M, Wistoff N, Andri R, Cavigelli L, Benini L. A \u201cNew Ara\u201d for vector computing: An open source highly efficient RISC-V V 1.0 vector processor design. In Proc. the 33rd IEEE International Conference on Application-specific Systems, Architectures and Processors, Jul. 2022, pp.43\u201351. DOI: https:\/\/doi.org\/10.1109\/ASAP54787.2022.00017."},{"key":"4555_CR7","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1145\/2000064.2000080","volume-title":"Proc. the 38th Annual International Symposium on Computer Architecture","author":"Y Lee","year":"2011","unstructured":"Lee Y, Avizienis R, Bishara A, Xia R, Lockhart D, Batten C, Asanovi\u0107 K. Exploring the tradeoffs between programmability and efficiency in data-parallel accelerators. In Proc. the 38th Annual International Symposium on Computer Architecture, Jun. 2011, pp.129\u2013140. DOI: https:\/\/doi.org\/10.1145\/2024723.2000080."},{"key":"4555_CR8","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1109\/ISVLSI.2015.23","volume-title":"Proc. the 2015 IEEE Computer Society Annual Symposium on VLSI","author":"I Ratkovic","year":"2015","unstructured":"Ratkovic I, PaloMar O, Stanic M, Duric M, Peic D, Unsal O, Cristal A, Valero M. Joint circuit-system design space exploration of multiplier unit structure for energyefficient vector processors. In Proc. the 2015 IEEE Computer Society Annual Symposium on VLSI, Jul. 2015, pp.19\u201326. DOI: https:\/\/doi.org\/10.1109\/ISVLSI.2015.23."},{"key":"4555_CR9","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1145\/1450095.1450107","volume-title":"Proc. the 2008 International Conference on Compilers, Architectures and Synthesis for Embedded Systems","author":"P Yiannacouras","year":"2008","unstructured":"Yiannacouras P, Steffan J G, Rose J. VESPA: Portable, scalable, and flexible FPGA-based vector processors. In Proc. the 2008 International Conference on Compilers, Architectures and Synthesis for Embedded Systems, Oct. 2008, pp.61\u201370. DOI: https:\/\/doi.org\/10.1145\/1450095.1450107."},{"issue":"1","key":"4555_CR10","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1109\/JSSC.2021.3118046","volume":"57","author":"C Schmidt","year":"2022","unstructured":"Schmidt C, Wright J, Wang Z, Chang E, Ou A, Bae W, Huang S, Milovanovic V, Flynn A, Richards B, Asanovic K, Alon E, Nikolic B. An eight-core 1.44-GHz RISC-V vector processor in 16-nm FinFET. IEEE Journal of Solid-State Circuits, 2022, 57(1): 140\u2013152. DOI: https:\/\/doi.org\/10.1109\/JSSC.2021.3118046.","journal-title":"IEEE Journal of Solid-State Circuits"},{"issue":"2","key":"4555_CR11","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1109\/40.918001","volume":"21","author":"B Khailany","year":"2001","unstructured":"Khailany B, Dally W J, Kapasi U J, Mattson P, Namkoong J, Owens J D, Towles B, Chang A, Rixner S. Imagine: Media processing with streams. IEEE Micro, 2001, 21(2): 35\u201346. DOI: https:\/\/doi.org\/10.1109\/40.918001.","journal-title":"IEEE Micro"},{"key":"4555_CR12","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1145\/774789.774805","volume-title":"Proc. the 10th International Symposium on Hardware\/Software Codesign","author":"R Banakar","year":"2002","unstructured":"Banakar R, Steinke S, Lee B S, Balakrishnan M, Marwedel P. Scratchpad memory: A design alternative for cache on-chip memory in embedded systems. In Proc. the 10th International Symposium on Hardware\/Software Codesign, May 2002, pp.73\u201378. DOI: https:\/\/doi.org\/10.1145\/774789.774805."},{"key":"4555_CR13","doi-asserted-by":"publisher","first-page":"670","DOI":"10.1145\/3352460.3358277","volume-title":"Proc. the 52nd Annual IEEE\/ACM International Symposium on Microarchitecture","author":"G Gobieski","year":"2019","unstructured":"Gobieski G, Nagi A, Serafin N, Isgenc M M, Beckmann N, Lucia B. MANIC: A vector-dataflow architecture for ultra-low-power embedded systems. In Proc. the 52nd Annual IEEE\/ACM International Symposium on Microarchitecture, Oct. 2019, pp.670\u2013684. DOI: https:\/\/doi.org\/10.1145\/3352460.3358277."},{"key":"4555_CR14","doi-asserted-by":"publisher","first-page":"260","DOI":"10.1109\/CMPCON.1997.584724","volume-title":"Proc. the 1997 IEEE COMPCON 97. Digest of Papers","author":"M Eden","year":"1997","unstructured":"Eden M, Kagan M. The Pentium\u00ae processor with MMX\u2122 technology. In Proc. the 1997 IEEE COMPCON 97. Digest of Papers, Feb. 1997, pp.260\u2013262. DOI: https:\/\/doi.org\/10.1109\/CMPCON.1997.584724."},{"key":"4555_CR15","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2012.7476487","volume-title":"Proc. the 2012 IEEE Hot Chips 24 Symposium","author":"G Chrysos","year":"2012","unstructured":"Chrysos G. Intelff Xeon Phi coprocessor (codename Knights corner). In Proc. the 2012 IEEE Hot Chips 24 Symposium, Aug. 2012. DOI: https:\/\/doi.org\/10.1109\/HOTCHIPS.2012.7476487."},{"issue":"2","key":"4555_CR16","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1109\/MM.2017.38","volume":"37","author":"J Doweck","year":"2017","unstructured":"Doweck J, Kao W F, Lu A K Y, Mandelblat J, Rahatekar A, Rappoport L, Rotem E, Yasin A, Yoaz A. Inside 6th-generation Intel core: New microarchitecture code-named Skylake. IEEE Micro, 2017, 37(2): 52\u201362. DOI: https:\/\/doi.org\/10.1109\/MM.2017.38.","journal-title":"IEEE Micro"},{"issue":"1","key":"4555_CR17","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1145\/359327.359336","volume":"21","author":"R M Russell","year":"1978","unstructured":"Russell R M. The CRAY-1 computer system. Communications of the ACM, 1978, 21(1): 63\u201372. DOI: https:\/\/doi.org\/10.1145\/359327.359336.","journal-title":"Communications of the ACM"},{"key":"4555_CR18","doi-asserted-by":"publisher","DOI":"10.23919\/VLSICircuits52068.2021.9492415","volume-title":"Proc. the 2021 Symposium on VLSI Circuits","author":"S Matsuoka","year":"2021","unstructured":"Matsuoka S. Fugaku and A64FX: The first exascale supercomputer and its innovative Arm CPU. In Proc. the 2021 Symposium on VLSI Circuits, Jun. 2021. DOI: https:\/\/doi.org\/10.23919\/VLSICircuits52068.2021.9492415."},{"key":"4555_CR19","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1145\/1950413.1950420","volume-title":"Proc. the 19th ACM\/SIGDA International Symposium on Field Programmable Gate Arrays","author":"C H Chou","year":"2011","unstructured":"Chou C H, Severance A, Brant A D, Liu Z, Sant S, Lemieux G G F. VEGAS: Soft vector processor with scratchpad memory. In Proc. the 19th ACM\/SIGDA International Symposium on Field Programmable Gate Arrays, Feb. 27\u2013Mar. 1, 2011, pp.15\u201324. DOI: https:\/\/doi.org\/10.1145\/1950413.1950420."},{"issue":"4","key":"4555_CR20","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1109\/MC.2007.124","volume":"40","author":"J Gebis","year":"2007","unstructured":"Gebis J, Patterson D. Embracing and extending 20th-century instruction set architectures. Computer, 2007, 40(4): 68\u201375. DOI: https:\/\/doi.org\/10.1109\/MC.2007.124.","journal-title":"Computer"},{"issue":"1","key":"4555_CR21","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1109\/MC.2017.8","volume":"50","author":"T M Conte","year":"2017","unstructured":"Conte T M, DeBenedictis E P, Gargini P A, Track E. Rebooting computing: The road ahead. Computer, 2017, 50(1): 20\u201329. DOI: https:\/\/doi.org\/10.1109\/MC.2017.8.","journal-title":"Computer"},{"issue":"3","key":"4555_CR22","doi-asserted-by":"publisher","first-page":"122","DOI":"10.1109\/MM.2012.17","volume":"32","author":"H Esmaeilzadeh","year":"2012","unstructured":"Esmaeilzadeh H, Blem E, St. Amant R, Sankaralingam K, Burger D. Dark silicon and the end of multicore scaling. IEEE Micro, 2012, 32(3): 122\u2013134. DOI: https:\/\/doi.org\/10.1109\/MM.2012.17.","journal-title":"IEEE Micro"},{"key":"4555_CR23","doi-asserted-by":"publisher","first-page":"281","DOI":"10.1109\/HPCA.1996.501193","volume-title":"Proc. the 2nd International Symposium on High-Performance Computer Architecture","author":"R Espasa","year":"1996","unstructured":"Espasa R, Valero M. Decoupled vector architectures. In Proc. the 2nd International Symposium on High-Performance Computer Architecture, Feb. 1996, pp.281\u2013290. DOI: https:\/\/doi.org\/10.1109\/HPCA.1996.501193."},{"key":"4555_CR24","first-page":"344","volume-title":"Proc. the 22nd European Solid-State Circuits Conference","author":"K Asanovic","year":"1996","unstructured":"Asanovic K, Beck J, Irissou B, Kingsbury B E D, Wawrzynek J. T0: A single-chip vector microprocessor with reconfigurable pipelines. In Proc. the 22nd European Solid-State Circuits Conference, Sept. 1996, pp.344\u2013347."},{"issue":"1","key":"4555_CR25","doi-asserted-by":"publisher","first-page":"114","DOI":"10.1007\/s42514-020-00057-2","volume":"3","author":"Y Wang","year":"2021","unstructured":"Wang Y, Li C, Liu C, Liu S, Lei Y, Zhang J, Zhang Y, Guo Y. Advancing DSP into HPC, AI, and beyond: Challenges, mechanisms, and future directions. CCF Trans. High Performance Computing, 2021, 3(1): 114\u2013125. DOI: https:\/\/doi.org\/10.1007\/s42514-020-00057-2.","journal-title":"CCF Trans. High Performance Computing"},{"key":"4555_CR26","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.2006.1692638","volume-title":"Proc. the 2006 IEEE International Symposium on Circuits and Systems","author":"J Cho","year":"2006","unstructured":"Cho J, Chang H, Sung W. An FPGA based SIMD processor with a vector memory unit. In Proc. the 2006 IEEE International Symposium on Circuits and Systems, May 2006. DOI: https:\/\/doi.org\/10.1109\/ISCAS.2006.1692638."},{"key":"4555_CR27","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1109\/ISCA.2006.37","volume-title":"Proc. the 33rd International Symposium on Computer Architecture","author":"Y Lin","year":"2006","unstructured":"Lin Y, Lee H, Woh M, Harel Y, Mahlke S, Mudge T, Chakrabarti C, Flautner K. SODA: A low-power architecture for software radio. In Proc. the 33rd International Symposium on Computer Architecture, Jun. 2006, pp.89\u2013101. DOI: https:\/\/doi.org\/10.1109\/ISCA.2006.37."},{"issue":"1","key":"4555_CR28","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1109\/MM.2010.8","volume":"30","author":"M Woh","year":"2010","unstructured":"Woh M, Seo S, Mahlke S, Mudge T, Chakrabarti C, Flautner K. AnySP: Anytime anywhere anyway signal processing. IEEE Micro, 2010, 30(1): 81\u201391. DOI: https:\/\/doi.org\/10.1109\/MM.2010.8.","journal-title":"IEEE Micro"},{"issue":"8","key":"4555_CR29","doi-asserted-by":"publisher","first-page":"1429","DOI":"10.1109\/TVLSI.2011.2160463","volume":"20","author":"P Yiannacouras","year":"2012","unstructured":"Yiannacouras P, Steffan J G, Rose J. Portable, flexible, and scalable soft vector processors. IEEE Trans. Very Large Scale Integration (VLSI) Systems, 2012, 20(8): 1429\u20131442. DOI: https:\/\/doi.org\/10.1109\/TVLSI.2011.2160463.","journal-title":"IEEE Trans. Very Large Scale Integration (VLSI) Systems"},{"key":"4555_CR30","series-title":"Technical Report","volume-title":"The Hwacha microarchitecture manual, version 3.8.1","author":"Y Lee","year":"2015","unstructured":"Lee Y, Ou A, Schmidt C, Karandikar S, Mao H, Asanovi\u0107 K. The Hwacha microarchitecture manual, version 3.8.1. Technical Report. Report UCB\/EECS-2015-263, University of California, Berkeley, 2015. https:\/\/www2.eecs.berkeley.edu\/Pubs\/TechRpts\/2015\/EECS-2015-263.html, Jul. 2025."},{"issue":"6","key":"4555_CR31","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1109\/MM.2003.1261385","volume":"23","author":"C E Kozyrakis","year":"2003","unstructured":"Kozyrakis C E, Patterson D A. Scalable, vector processors for embedded systems. IEEE Micro, 2003, 23(6): 36\u201345. DOI: https:\/\/doi.org\/10.1109\/MM.2003.1261385.","journal-title":"IEEE Micro"},{"key":"4555_CR32","doi-asserted-by":"publisher","unstructured":"Yu J, Eagleston C, Chou C H Y, Perreault M, Lemieux G. Vector processing as a soft processor accelerator. ACM Trans. Reconfigurable Technology and Systems, 2009, 2(2): Article No. 12. DOI: https:\/\/doi.org\/10.1145\/1534916.1534922.","DOI":"10.1145\/1534916.1534922"},{"issue":"9","key":"4555_CR33","doi-asserted-by":"publisher","first-page":"3804","DOI":"10.1109\/TCSI.2022.3182483","volume":"69","author":"M Attari","year":"2022","unstructured":"Attari M, Ferreira L, Liu L, Malkowsky S. An application specific vector processor for efficient massive MIMO processing. IEEE Trans. Circuits and Systems I: Regular Papers, 2022, 69(9): 3804\u20133815. DOI: https:\/\/doi.org\/10.1109\/TCSI.2022.3182483.","journal-title":"IEEE Trans. Circuits and Systems I: Regular Papers"},{"issue":"2","key":"4555_CR34","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1109\/T-C.1971.223205","volume":"C-20","author":"H S Stone","year":"1971","unstructured":"Stone H S. Parallel processing with the perfect shuffle. IEEE Trans. Computers, 1971, C-20(2): 153\u2013161. DOI: https:\/\/doi.org\/10.1109\/T-C.1971.223205.","journal-title":"IEEE Trans. Computers"},{"key":"4555_CR35","doi-asserted-by":"publisher","first-page":"301","DOI":"10.1109\/FPT.2005.1568571","volume-title":"Proc. the 2005 IEEE International Conference on Field-Programmable Technology, 2005","author":"J Fender","year":"2005","unstructured":"Fender J, Rose J, Galloway D. The transmogrifier-4: An FPGA-based hardware development system with multi-gigabyte memory capacity and high host and memory bandwidth. In Proc. the 2005 IEEE International Conference on Field-Programmable Technology, 2005, Dec. 2005, pp.301\u2013302. DOI: https:\/\/doi.org\/10.1109\/FPT.2005.1568571."},{"key":"4555_CR36","doi-asserted-by":"publisher","first-page":"199","DOI":"10.1109\/ESSCIRC.2014.6942056","volume-title":"Proc. the 40th European Solid State Circuits Conference","author":"Y Lee","year":"2014","unstructured":"Lee Y, Waterman A, Avizienis R, Cook H, Sun C, Stojanovi\u0107 V, Asanovi\u0107 K. A 45nm 1.3GHz 16.7 double-precision GFLOPS\/W RISC-V processor with vector accelerators. In Proc. the 40th European Solid State Circuits Conference, Sept. 2014, pp.199\u2013202. DOI: https:\/\/doi.org\/10.1109\/ESSCIRC.2014.6942056."},{"key":"4555_CR37","series-title":"Technical Report","volume-title":"The Hwacha vector-fetch architecture manual, version 3.8.1","author":"Y Lee","year":"2015","unstructured":"Lee Y, Schmidt C, Ou A J H, Waterman A, Asanovic K. The Hwacha vector-fetch architecture manual, version 3.8.1. Technical Report, Report UCB\/EECS-2015-262, University of California, Berkeley, Berkeley, CA, USA, 2015. https:\/\/aspire.eecs.berkeley.edu\/publication\/the-hwacha-vector-fetch-architecture-manual-version-3-8-1\/, Jul. 2025."},{"issue":"3","key":"4555_CR38","doi-asserted-by":"publisher","first-page":"112","DOI":"10.1145\/1067649.801719","volume":"10","author":"J E Smith","year":"1982","unstructured":"Smith J E. Decoupled access\/execute computer architectures. ACM SIGARCH Computer Architecture News, 1982, 10(3): 112\u2013119. DOI: https:\/\/doi.org\/10.1145\/1067649.801719.","journal-title":"ACM SIGARCH Computer Architecture News"},{"key":"4555_CR39","doi-asserted-by":"publisher","first-page":"331","DOI":"10.1109\/MICRO.2004.9","volume-title":"Proc. the 37th International Symposium on Microarchitecture","author":"C Batten","year":"2004","unstructured":"Batten C, Krashinsky R, Gerding S, Asanovic K. Cache refill\/access decoupling for vector machines. In Proc. the 37th International Symposium on Microarchitecture, Dec. 2004, pp.331\u2013342. DOI: https:\/\/doi.org\/10.1109\/MICRO.2004.9."},{"key":"4555_CR40","series-title":"Technical Report","volume-title":"Hwacha preliminary evaluation results, version 3.8.1","author":"Y Lee","year":"2015","unstructured":"Lee Y, Schmidt C, Karandikar S, Dabbelt D, Ou A J H, Asanovic K. Hwacha preliminary evaluation results, version 3.8.1. Technical Report, Report UCB\/EECS-2015-264, University of California, Berkeley, CA, USA, 2015. https:\/\/aspire.eecs.berkeley.edu\/publication\/hwacha-preliminary-evaluation-results-version-3-8-1\/, Jul. 2025."},{"key":"4555_CR41","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1109\/FCCM.2006.10","volume-title":"Proc. the 14th Annual IEEE Symposium on Field-Programmable Custom Computing Machines","author":"B Fort","year":"2006","unstructured":"Fort B, Capalija D, Vranesic Z G, Brown S D. A multi-threaded soft processor for SoPC area reduction. In Proc. the 14th Annual IEEE Symposium on Field-Programmable Custom Computing Machines, Apr. 2006, pp.131\u2013142. DOI: https:\/\/doi.org\/10.1109\/FCCM.2006.10."},{"key":"4555_CR42","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.2019.8702770","volume-title":"Proc. the 2019 IEEE International Symposium on Circuits and Systems","author":"S Malkowsky","year":"2019","unstructured":"Malkowsky S, Prabhu H, Liu L, Edfors O, \u00d6wall V. A programmable 16-lane SIMD ASIP for massive MIMO. In Proc. the 2019 IEEE International Symposium on Circuits and Systems, May 2019. DOI: https:\/\/doi.org\/10.1109\/ISCAS.2019.8702770."},{"issue":"4","key":"4555_CR43","doi-asserted-by":"publisher","first-page":"930","DOI":"10.1109\/JSSC.2016.2519386","volume":"51","author":"B Zimmer","year":"2016","unstructured":"Zimmer B, Lee Y, Puggelli A, Kwak J, Jevti\u0107 R, Keller B, Bailey S, Blagojevi\u0107 M, Chiu P F, Le H P, Chen P H, Sutardja N, Avizienis R, Waterman A, Richards B, Flatresse P, Alon E, Asanovi\u0107 K, Nikoli\u0107 B. A RISC-V vector processor with simultaneous-switching switched-capacitor DC-DC converters in 28 nm FDSOI. IEEE Journal of Solid-State Circuits, 2016, 51(4): 930\u2013942. DOI: https:\/\/doi.org\/10.1109\/JSSC.2016.2519386.","journal-title":"IEEE Journal of Solid-State Circuits"},{"key":"4555_CR44","doi-asserted-by":"publisher","first-page":"543","DOI":"10.1109\/HPCA57654.2024.00047","volume-title":"Proc. the 2024 IEEE International Symposium on High-Performance Computer Architecture","author":"O Chatzopoulos","year":"2024","unstructured":"Chatzopoulos O, Papadimitriou G, Karakostas V, Gizopoulos D. Gem5-MARVEL: Microarchitecture-level resilience analysis of heterogeneous SoC architectures. In Proc. the 2024 IEEE International Symposium on High-Performance Computer Architecture, Mar. 2024, pp.543\u2013559. DOI: https:\/\/doi.org\/10.1109\/HPCA57654.2024.00047."},{"key":"4555_CR45","doi-asserted-by":"publisher","first-page":"789","DOI":"10.1109\/HPCA51647.2021.00071","volume-title":"Proc. the 2021 IEEE International Symposium on High-Performance Computer Architecture","author":"H Liao","year":"2021","unstructured":"Liao H, Tu J, Xia J, Liu H, Zhou X, Yuan H, Hu Y. Ascend: A scalable and unified architecture for ubiquitous deep neural network computing: Industry track paper. In Proc. the 2021 IEEE International Symposium on High-Performance Computer Architecture, Feb. 27\u2013Mar. 3, 2021, pp.789\u2013801. DOI: https:\/\/doi.org\/10.1109\/HPCA51647.2021.00071."},{"key":"4555_CR46","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875654","volume-title":"Proc. the 2019 IEEE Hot Chips 31 Symposium","author":"H Liao","year":"2019","unstructured":"Liao H, Tu J, Xia J, Zhou X. DaVinci: A scalable architecture for neural network computing. In Proc. the 2019 IEEE Hot Chips 31 Symposium, Aug. 2019. DOI: https:\/\/doi.org\/10.1109\/HOTCHIPS.2019.8875654."},{"key":"4555_CR47","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/ISSCC42613.2021.9366056","volume-title":"Proc. the 2021 IEEE International Solid-State Circuits Conference","author":"J Ouyang","year":"2021","unstructured":"Ouyang J, Du X, Ma Y, Liu J. 3.3 Kunlun: A 14nm high-performance AI processor for diversified workloads. In Proc. the 2021 IEEE International Solid-State Circuits Conference, Feb. 2021, pp.50\u201351. DOI: https:\/\/doi.org\/10.1109\/ISSCC42613.2021.9366056."},{"key":"4555_CR48","volume-title":"Proc. the 2017 IEEE Hot Chips 28 Symposium","author":"J Ouyang","year":"2017","unstructured":"Ouyang J, Wu E, Wang J, Li Y, Xie H. XPU\u2014A programmable FPGA accelerator for diverse workloads. In Proc. the 2017 IEEE Hot Chips 28 Symposium, Sept. 2017."},{"issue":"5","key":"4555_CR49","doi-asserted-by":"publisher","first-page":"1181","DOI":"10.1109\/TC.2021.3076987","volume":"71","author":"D Kang","year":"2022","unstructured":"Kang D, Kang D, Ha S. Multi-bank on-chip memory management techniques for CNN accelerators. IEEE Trans. Computers, 2022, 71(5): 1181\u20131193. DOI: https:\/\/doi.org\/10.1109\/TC.2021.3076987.","journal-title":"IEEE Trans. Computers"},{"issue":"6","key":"4555_CR50","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1109\/MM.2013.129","volume":"34","author":"S Chen","year":"2014","unstructured":"Chen S, Wang Y, Liu S, Wan J, Chen H, Liu H, Zhang K, Liu X, Ning X. FT-matrix: A coordination-aware architecture for signal processing. IEEE Micro, 2014, 34(6): 64\u201373. DOI: https:\/\/doi.org\/10.1109\/MM.2013.129.","journal-title":"IEEE Micro"},{"key":"4555_CR51","doi-asserted-by":"publisher","unstructured":"Wang Y, Wang D, Chen S, Liu Z, Chen S, Chen X, Zhou X. Iteration interleaving-based SIMD lane partition. ACM Trans. Architecture and Code Optimization, 2016, 12(4): Article No. 58. DOI: https:\/\/doi.org\/10.1145\/2847253.","DOI":"10.1145\/2847253"},{"issue":"2","key":"4555_CR52","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1145\/1273440.1250689","volume":"35","author":"X Yang","year":"2007","unstructured":"Yang X, Yan X, Xing Z, Deng Y, Jiang J, Zhang Y. A 64-bit stream processor architecture for scientific applications. ACM SIGARCH Computer Architecture News, 2007, 35(2): 210\u2013219. DOI: https:\/\/doi.org\/10.1145\/1273440.1250689.","journal-title":"ACM SIGARCH Computer Architecture News"},{"key":"4555_CR53","doi-asserted-by":"publisher","first-page":"464","DOI":"10.1109\/ICPP.2011.79","volume-title":"Proc. the 2011 International Conference on Parallel Processing","author":"Y Guo","year":"2011","unstructured":"Guo Y, Zhuge Q, Hu J, Qiu M, Sha E H M. Optimal data allocation for scratch-pad memory on embedded multicore systems. In Proc. the 2011 International Conference on Parallel Processing, Sept. 2011, pp.464\u2013471. DOI: https:\/\/doi.org\/10.1109\/ICPP.2011.79."},{"key":"4555_CR54","doi-asserted-by":"publisher","first-page":"411","DOI":"10.1145\/3613424.3614268","volume-title":"Proc. the 56th IEEE\/ACM International Symposium on Microarchitecture","author":"R Fan","year":"2023","unstructured":"Fan R, Cui Y, Chen Q, Wang M, Zhang Y, Zheng W, Li Z. MAICC: A lightweight many-core architecture with incache computing for multi-DNN parallel inference. In Proc. the 56th IEEE\/ACM International Symposium on Microarchitecture, Oct. 28\u2013Nov. 1, 2023, pp.411\u2013423. DOI: https:\/\/doi.org\/10.1145\/3613424.3614268."},{"key":"4555_CR55","doi-asserted-by":"publisher","first-page":"399","DOI":"10.1109\/ISCA.2003.1207017","volume-title":"Proc. the 30th Annual International Symposium on Computer Architecture","author":"C Kozyrakis","year":"2003","unstructured":"Kozyrakis C, Patterson D. Overcoming the limitations of conventional vector processors. In Proc. the 30th Annual International Symposium on Computer Architecture, Jun. 2003, pp.399\u2013409. DOI: https:\/\/doi.org\/10.1109\/ISCA.2003.1207017."},{"key":"4555_CR56","doi-asserted-by":"publisher","unstructured":"Zhang D, Lang Q, Wang R, Shen L. Extension VM: Interleaved data layout in vector memory. ACM Trans. Architecture and Code Optimization, 2024, 21(1): Article No. 18. DOI: https:\/\/doi.org\/10.1145\/3631528.","DOI":"10.1145\/3631528"},{"key":"4555_CR57","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247913","volume-title":"Proc. the 60th ACM\/IEEE Design Automation Conference","author":"Y Ding","year":"2023","unstructured":"Ding Y, Liu C, Duan M, Chang W, Li K, Li K. HAIMA: A hybrid SRAM and DRAM accelerator-in-memory architecture for Transformer. In Proc. the 60th ACM\/IEEE Design Automation Conference, Jul. 2023. DOI: https:\/\/doi.org\/10.1109\/DAC56929.2023.10247913."},{"issue":"1","key":"4555_CR58","doi-asserted-by":"publisher","first-page":"75","DOI":"10.7544\/issn1000-1239.202330151","volume":"62","author":"D Zhang","year":"2025","unstructured":"Zhang D, Zeng L, Wang R, Wang Y, Shen L. Shuffle-SRAM: In-SRAM parallel bitwise data shuffle. Journal of Computer Research and Development, 2025, 62(1): 75\u201389. DOI: https:\/\/doi.org\/10.7544\/issn1000-1239.202330151. (in Chinese)","journal-title":"Journal of Computer Research and Development"}],"container-title":["Journal of Computer Science and Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-4555-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11390-025-4555-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-4555-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T06:02:51Z","timestamp":1763704971000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11390-025-4555-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":58,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["4555"],"URL":"https:\/\/doi.org\/10.1007\/s11390-025-4555-4","relation":{},"ISSN":["1000-9000","1860-4749"],"issn-type":[{"value":"1000-9000","type":"print"},{"value":"1860-4749","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]},"assertion":[{"value":"4 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Conflict of Interest\n                      The authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics"}}]}}