{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:13:21Z","timestamp":1766067201719,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":120,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,9,30]],"date-time":"2020-09-30T00:00:00Z","timestamp":1601424000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["FA8650-18-2-7864"],"award-info":[{"award-number":["FA8650-18-2-7864"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,9,30]]},"DOI":"10.1145\/3410463.3414627","type":"proceedings-article","created":{"date-parts":[[2020,9,30]],"date-time":"2020-09-30T10:43:04Z","timestamp":1601462584000},"page":"175-190","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Transmuter"],"prefix":"10.1145","author":[{"given":"Subhankar","family":"Pal","sequence":"first","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siying","family":"Feng","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dong-hyeon","family":"Park","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sung","family":"Kim","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aporva","family":"Amarnath","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chi-Sheng","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"He","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jonathan","family":"Beaumont","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kyle","family":"May","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yan","family":"Xiong","sequence":"additional","affiliation":[{"name":"Arizona State University, Tempe, AZ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kuba","family":"Kaszyk","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"John Magnus","family":"Morton","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiawen","family":"Sun","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"O'Boyle","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Murray","family":"Cole","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chaitali","family":"Chakrabarti","sequence":"additional","affiliation":[{"name":"Arizona State University, Tempe, AZ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Blaauw","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hun-Seok","family":"Kim","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Trevor","family":"Mudge","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ronald","family":"Dreslinski","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2020,9,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Martin","year":"2016","unstructured":"Martin Abadi , Paul Barham , Jianmin Chen , Zhifeng Chen , Andy Davis , Jeffrey Dean , Matthieu Devin , Sanjay Ghemawat , Geoffrey Irving , Michael Isard , Manjunath Kudlur , Josh Levenberg , Rajat Monga , Sherry Moore , Derek G. Murray , Benoit Steiner , Paul Tucker , Vijay Vasudevan , Pete Warden , Martin Wicke , Yuan Yu , and Xiaoqiang Zheng . 2016 . TensorFlow: A system for large-scale machine learning . In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) . 265--283. Martin Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 265--283."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522344"},{"key":"e_1_3_2_1_3_1","first-page":"33","article-title":"Pyramid Methods in Image Processing","volume":"29","author":"Adelson Edward H","year":"1984","unstructured":"Edward H Adelson , Charles H Anderson , James R Bergen , Peter J Burt , and Joan M Ogden . 1984 . Pyramid Methods in Image Processing . RCA Engineer , Vol. 29 , 6 (1984), 33 -- 41 . Edward H Adelson, Charles H Anderson, James R Bergen, Peter J Burt, and Joan M Ogden. 1984. Pyramid Methods in Image Processing. RCA Engineer, Vol. 29, 6 (1984), 33--41.","journal-title":"RCA Engineer"},{"key":"e_1_3_2_1_4_1","volume-title":"X-CGRA: An energy-efficient approximate coarse-grained reconfigurable architecture","author":"Akbari Omid","year":"2019","unstructured":"Omid Akbari , Mehdi Kamal , Ali Afzali-Kusha , Massoud Pedram , and Muhammad Shafique . 2019. X-CGRA: An energy-efficient approximate coarse-grained reconfigurable architecture . IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems ( 2019 ). Omid Akbari, Mehdi Kamal, Ali Afzali-Kusha, Massoud Pedram, and Muhammad Shafique. 2019. X-CGRA: An energy-efficient approximate coarse-grained reconfigurable architecture. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (2019)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.21236\/ADA604494"},{"key":"e_1_3_2_1_6_1","volume-title":"2014 22nd European Signal Processing Conference (EUSIPCO). IEEE, 266--270","author":"Ayhan Tuba","year":"2014","unstructured":"Tuba Ayhan , Wim Dehaene , and Marian Verhelst . 2014 . A 128textasciitilde 2048\/1536 point FFT hardware implementation with output pruning . In 2014 22nd European Signal Processing Conference (EUSIPCO). IEEE, 266--270 . Tuba Ayhan, Wim Dehaene, and Marian Verhelst. 2014. A 128textasciitilde 2048\/1536 point FFT hardware implementation with output pruning. In 2014 22nd European Signal Processing Conference (EUSIPCO). IEEE, 266--270."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2436256.2436271"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/774789.774805"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1137\/110838844"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_2_1_11_1","volume-title":"The M5 simulator: Modeling networked systems. Ieee micro","author":"Binkert Nathan L","year":"2006","unstructured":"Nathan L Binkert , Ronald G Dreslinski , Lisa R Hsu , Kevin T Lim , Ali G Saidi , and Steven K Reinhardt . 2006. The M5 simulator: Modeling networked systems. Ieee micro , Vol. 26 , 4 ( 2006 ), 52--60. Nathan L Binkert, Ronald G Dreslinski, Lisa R Hsu, Kevin T Lim, Ali G Saidi, and Steven K Reinhardt. 2006. The M5 simulator: Modeling networked systems. Ieee micro, Vol. 26, 4 (2006), 52--60."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the GPU Technology Conference","author":"Buck Ian","year":"2010","unstructured":"Ian Buck . 2010 . The evolution of GPUs for general purpose computing . In Proceedings of the GPU Technology Conference 2010. 11. Ian Buck. 2010. The evolution of GPUs for general purpose computing. In Proceedings of the GPU Technology Conference 2010. 11."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.565"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2009.2037211"},{"key":"e_1_3_2_1_16_1","first-page":"260","article-title":"Embedded configurable logic ASIC","volume":"6","author":"Chang Web","year":"2001","unstructured":"Web Chang . 2001 . Embedded configurable logic ASIC . US Patent 6 , 260 ,087. Web Chang. 2001. Embedded configurable logic ASIC. US Patent 6,260,087.","journal-title":"US Patent"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 26th International Conference on Neural Information Processing Systems -","volume":"2","author":"Cuturi Marco","year":"2013","unstructured":"Marco Cuturi . 2013 . Sinkhorn Distances: Lightspeed Computation of Optimal Transport . In Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2 (NIPS'13). 2292--2300. Marco Cuturi. 2013. Sinkhorn Distances: Lightspeed Computation of Optimal Transport. In Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2 (NIPS'13). 2292--2300."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358276"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.022071133"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/645496.658058"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220025"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2554688.2554785"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/567806.567810"},{"key":"e_1_3_2_1_25_1","volume-title":"Present and Future. In IEEE 17th International Conference on Application-specific Systems, Architectures and Processors (ASAP'06)","author":"Jr E Swartzlander Earl","year":"2006","unstructured":"E Swartzlander Earl Jr . 2006 . Systolic FFT Processors: Past , Present and Future. In IEEE 17th International Conference on Application-specific Systems, Architectures and Processors (ASAP'06) . IEEE, 153--158. E Swartzlander Earl Jr. 2006. Systolic FFT Processors: Past, Present and Future. In IEEE 17th International Conference on Application-specific Systems, Architectures and Processors (ASAP'06). IEEE, 153--158."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.2013.6572129"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1058129.1058148"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00033"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-015-1483-z"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-78890-6_53"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS.2013.47"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129626408003545"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446059"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11265-015-1057-6"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2016.7482073"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.839324"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749755"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2465438"},{"key":"e_1_3_2_1_39_1","first-page":"19","article-title":"Ambric's new parallel processor","volume":"20","author":"Halfhill Tom R","year":"2006","unstructured":"Tom R Halfhill . 2006 . Ambric's new parallel processor . Microprocessor Report , Vol. 20 , 10 (2006), 19 -- 26 . Tom R Halfhill. 2006. Ambric's new parallel processor. Microprocessor Report, Vol. 20, 10 (2006), 19--26.","journal-title":"Microprocessor Report"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392751"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2014.6757323"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/PROC.1986.13691"},{"volume-title":"Proceedings of the 34th Annual International Symposium on Computer Architecture","author":"Ipek Engin","key":"e_1_3_2_1_44_1","unstructured":"Engin Ipek , Meyrem Kirman , Nevin Kirman , and Jose F. Martinez . 2007. Core Fusion: Accommodating Software Diversity in Chip Multiprocessors . In Proceedings of the 34th Annual International Symposium on Computer Architecture ( San Diego, California, USA) (ISCA '07). ACM, 186--197. Engin Ipek, Meyrem Kirman, Nevin Kirman, and Jose F. Martinez. 2007. Core Fusion: Accommodating Software Diversity in Chip Multiprocessors. In Proceedings of the 34th Annual International Symposium on Computer Architecture (San Diego, California, USA) (ISCA '07). ACM, 186--197."},{"key":"e_1_3_2_1_45_1","volume-title":"Order-N tight-binding molecular dynamics on parallel computers. Computer physics communications","author":"Itoh Satoshi","year":"1995","unstructured":"Satoshi Itoh , Pablo Ordej\u00f3n , and Richard M Martin . 1995. Order-N tight-binding molecular dynamics on parallel computers. Computer physics communications , Vol. 88 , 2--3 ( 1995 ), 173--185. Satoshi Itoh, Pablo Ordej\u00f3n, and Richard M Martin. 1995. Order-N tight-binding molecular dynamics on parallel computers. Computer physics communications, Vol. 88, 2--3 (1995), 173--185."},{"volume-title":"High Performance Embedded Computing Conference (HPEC04","author":"Jackson Preston A.","key":"e_1_3_2_1_46_1","unstructured":"Preston A. Jackson , Cy P. Chan , Jonathan E. Scalera , Charles M. Rader , and M. Michael Vai . 2004. A systolic FFT architecture for real time FPGA systems,? High Performance Embedded Computing Conference (HPEC04 . In In High Performance Embedded Computing Conference (HPEC04. Preston A. Jackson, Cy P. Chan, Jonathan E. Scalera, Charles M. Rader, and M. Michael Vai. 2004. A systolic FFT architecture for real time FPGA systems,? High Performance Embedded Computing Conference (HPEC04. In In High Performance Embedded Computing Conference (HPEC04."},{"key":"e_1_3_2_1_47_1","unstructured":"Wenzel Jakob Jason Rhinelander and Dean Moldovan. 2017. pybind 11--Seamless operability between C 11 and Python. https:\/\/github.com\/pybind\/pybind11  Wenzel Jakob Jason Rhinelander and Dean Moldovan. 2017. pybind 11--Seamless operability between C 11 and Python. https:\/\/github.com\/pybind\/pybind11"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.45"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.241423"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_51_1","volume-title":"Nikolay Bogoychev, Andr\u00e9 F. T. Martins, and Alexandra Birch.","author":"Junczys-Dowmunt Marcin","year":"2018","unstructured":"Marcin Junczys-Dowmunt , Roman Grundkiewicz , Tomasz Dwojak , Hieu Hoang , Kenneth Heafield , Tom Neckermann , Frank Seide , Ulrich Germann , Alham Fikri Aji , Nikolay Bogoychev, Andr\u00e9 F. T. Martins, and Alexandra Birch. 2018 . Marian : Fast neural machine translation in C. arXiv preprint arXiv:1804.00344 (2018). Marcin Junczys-Dowmunt, Roman Grundkiewicz, Tomasz Dwojak, Hieu Hoang, Kenneth Heafield, Tom Neckermann, Frank Seide, Ulrich Germann, Alham Fikri Aji, Nikolay Bogoychev, Andr\u00e9 F. T. Martins, and Alexandra Birch. 2018. Marian: Fast neural machine translation in C. arXiv preprint arXiv:1804.00344 (2018)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062262"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555774"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2016.7761646"},{"key":"e_1_3_2_1_55_1","volume-title":"MorphCore: An Energy-Efficient Microarchitecture for High Performance ILP and High Throughput TLP. 2012 45th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Suleman M. Aater","year":"2012","unstructured":"Khubaib, M. Aater Suleman , Milad Hashemi , Chris Wilkerson , and Yale N. Patt . 2012 . MorphCore: An Energy-Efficient Microarchitecture for High Performance ILP and High Throughput TLP. 2012 45th Annual IEEE\/ACM International Symposium on Microarchitecture ( 2012 ), 305--316. Khubaib, M. Aater Suleman, Milad Hashemi, Chris Wilkerson, and Yale N. Patt. 2012. MorphCore: An Energy-Efficient Microarchitecture for High Performance ILP and High Throughput TLP. 2012 45th Annual IEEE\/ACM International Symposium on Microarchitecture (2012), 305--316."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.25"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192379"},{"volume-title":"Proceedings of the 42Nd Annual International Symposium on Computer Architecture","author":"Komuravelli Rakesh","key":"e_1_3_2_1_58_1","unstructured":"Rakesh Komuravelli , Matthew D. Sinclair , Johnathan Alsop , Muhammad Huzaifa , Maria Kotsifakou , Prakalp Srivastava , Sarita V. Adve , and Vikram S. Adve . 2015. Stash: Have Your Scratchpad and Cache It Too . In Proceedings of the 42Nd Annual International Symposium on Computer Architecture ( Portland, Oregon) (ISCA '15). ACM, 707--719. Rakesh Komuravelli, Matthew D. Sinclair, Johnathan Alsop, Muhammad Huzaifa, Maria Kotsifakou, Prakalp Srivastava, Sarita V. Adve, and Vikram S. Adve. 2015. Stash: Have Your Scratchpad and Cache It Too. In Proceedings of the 42Nd Annual International Symposium on Computer Architecture (Portland, Oregon) (ISCA '15). ACM, 707--719."},{"key":"e_1_3_2_1_59_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105.  Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105."},{"key":"e_1_3_2_1_60_1","volume-title":"Why systolic architectures? IEEE computer","author":"Kung Hsiang-Tsung","year":"1982","unstructured":"Hsiang-Tsung Kung . 1982. Why systolic architectures? IEEE computer , Vol. 15 , 1 ( 1982 ), 37--46. Hsiang-Tsung Kung. 1982. Why systolic architectures? IEEE computer, Vol. 15, 1 (1982), 37--46."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2006.884574"},{"volume-title":"FPGA architecture: Survey and challenges","author":"Kuon Ian","key":"e_1_3_2_1_62_1","unstructured":"Ian Kuon , Russell Tessier , and Jonathan Rose . 2008. FPGA architecture: Survey and challenges . Now Publishers Inc . Ian Kuon, Russell Tessier, and Jonathan Rose. 2008. FPGA architecture: Survey and challenges. Now Publishers Inc."},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the 32Nd International Conference on International Conference on Machine Learning -","volume":"37","author":"Kusner Matt J.","unstructured":"Matt J. Kusner , Yu Sun , Nicholas I. Kolkin , and Kilian Q. Weinberger . 2015. From Word Embeddings to Document Distances . In Proceedings of the 32Nd International Conference on International Conference on Machine Learning - Volume 37 (Lille, France) (ICML'15). 957--966. Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, and Kilian Q. Weinberger. 2015. From Word Embeddings to Document Distances. In Proceedings of the 32Nd International Conference on International Conference on Machine Learning - Volume 37 (Lille, France) (ICML'15). 957--966."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/FPT.2009.5377625"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2004.1327917"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC.2016.348"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2015.04.020"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/FPT.2009.5377609"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304626"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/SIPS.2008.4671772"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2463735"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357375"},{"key":"e_1_3_2_1_73_1","first-page":"1","article-title":"Mel Frequency Cepstral Coefficients for Music Modeling","volume":"270","author":"Logan Beth","year":"2000","unstructured":"Beth Logan . 2000 . Mel Frequency Cepstral Coefficients for Music Modeling . In ISMIR , Vol. 270. 1 -- 11 . Beth Logan. 2000. Mel Frequency Cepstral Coefficients for Music Modeling. In ISMIR, Vol. 270. 1--11.","journal-title":"ISMIR"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.25"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.21236\/ADA419598"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2013.6670338"},{"key":"e_1_3_2_1_78_1","volume-title":"Nitish Shirish Keskar, and Richard Socher","author":"Merity Stephen","year":"2018","unstructured":"Stephen Merity , Nitish Shirish Keskar, and Richard Socher . 2018 . An analysis of neural language modeling at multiple scales. arXiv preprint arXiv:1803.08240 (2018). Stephen Merity, Nitish Shirish Keskar, and Richard Socher. 2018. An analysis of neural language modeling at multiple scales. arXiv preprint arXiv:1803.08240 (2018)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/WOCN.2017.8065859"},{"key":"e_1_3_2_1_80_1","volume-title":"Pthreads library interface","author":"Mueller Frank","year":"1993","unstructured":"Frank Mueller . 1993. Pthreads library interface . Florida State University ( 1993 ). Frank Mueller. 1993. Pthreads library interface. Florida State University (1993)."},{"key":"e_1_3_2_1_81_1","volume-title":"A tool to model large caches. HP laboratories","author":"Muralimanohar Naveen","year":"2009","unstructured":"Naveen Muralimanohar , Rajeev Balasubramonian , and Norman P Jouppi . 2009. CACTI 6.0 : A tool to model large caches. HP laboratories , Vol. 27 ( 2009 ), 28. Naveen Muralimanohar, Rajeev Balasubramonian, and Norman P Jouppi. 2009. CACTI 6.0: A tool to model large caches. HP laboratories, Vol. 27 (2009), 28."},{"key":"e_1_3_2_1_82_1","volume-title":"A coarse grain reconfigurable array (cgra) for statically scheduled data flow computing. Wave Computing White Paper","author":"Nicol Chris","year":"2017","unstructured":"Chris Nicol . 2017. A coarse grain reconfigurable array (cgra) for statically scheduled data flow computing. Wave Computing White Paper ( 2017 ). Chris Nicol. 2017. A coarse grain reconfigurable array (cgra) for statically scheduled data flow computing. Wave Computing White Paper (2017)."},{"key":"e_1_3_2_1_83_1","volume-title":"Stream-Dataflow Acceleration. In Proceedings of the 44th Annual International Symposium on Computer Architecture","author":"Nowatzki Tony","year":"2017","unstructured":"Tony Nowatzki , Vinay Gangadhar , Newsha Ardalani , and Karthikeyan Sankaralingam . 2017 . Stream-Dataflow Acceleration. In Proceedings of the 44th Annual International Symposium on Computer Architecture ( Toronto, ON, Canada) (ISCA '17). ACM, 416--429. Tony Nowatzki, Vinay Gangadhar, Newsha Ardalani, and Karthikeyan Sankaralingam. 2017. Stream-Dataflow Acceleration. In Proceedings of the 44th Annual International Symposium on Computer Architecture (Toronto, ON, Canada) (ISCA '17). ACM, 416--429."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2014.6983052"},{"key":"e_1_3_2_1_85_1","first-page":"1","article-title":"Accelerating deep convolutional neural networks using specialized hardware","volume":"2","author":"Ovtcharov Kalin","year":"2015","unstructured":"Kalin Ovtcharov , Olatunji Ruwase , Joo-Young Kim , Jeremy Fowers , Karin Strauss , and Eric S Chung . 2015 . Accelerating deep convolutional neural networks using specialized hardware . Microsoft Research Whitepaper , Vol. 2 , 11 (2015), 1 -- 4 . Kalin Ovtcharov, Olatunji Ruwase, Joo-Young Kim, Jeremy Fowers, Karin Strauss, and Eric S Chung. 2015. Accelerating deep convolutional neural networks using specialized hardware. Microsoft Research Whitepaper, Vol. 2, 11 (2015), 1--4.","journal-title":"Microsoft Research Whitepaper"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124545"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00067"},{"key":"e_1_3_2_1_88_1","volume-title":"2019 Symposium on VLSI Circuits","author":"Pal Subhankar","year":"2019","unstructured":"Subhankar Pal , Dong-Hyeon Park , Siying Feng , Paul Gao , Jielun Tan , Austin Rovinski , Shaolin Xie , Chun Zhao , Aporva Amarnath , Timothy Wesley , Jonathan Beaumont , Kuan-Yu Chen , Chaitali Chakrabarti , Michael Bedford Taylor , Trevor N. Mudge , David T. Blaauw , Hun-Seok Kim , and Ronald G. Dreslinski . 2019. A 7.3 M Output Non-Zeros\/J Sparse Matrix-Matrix Multiplication Accelerator using Memory Reconfiguration in 40 nm . In 2019 Symposium on VLSI Circuits , Kyoto, Japan , June 9-14, 2019 . IEEE, 150. Subhankar Pal, Dong-Hyeon Park, Siying Feng, Paul Gao, Jielun Tan, Austin Rovinski, Shaolin Xie, Chun Zhao, Aporva Amarnath, Timothy Wesley, Jonathan Beaumont, Kuan-Yu Chen, Chaitali Chakrabarti, Michael Bedford Taylor, Trevor N. Mudge, David T. Blaauw, Hun-Seok Kim, and Ronald G. Dreslinski. 2019. A 7.3 M Output Non-Zeros\/J Sparse Matrix-Matrix Multiplication Accelerator using Memory Reconfiguration in 40 nm. In 2019 Symposium on VLSI Circuits, Kyoto, Japan, June 9-14, 2019. IEEE, 150."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2019.2960480"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2011.6043234"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11265-014-0896-x"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.tcs.2005.11.008"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/1059876.1059881"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080256"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.5555\/2665671.2665678"},{"key":"e_1_3_2_1_97_1","volume-title":"Improving GANs using optimal transport. arXiv preprint arXiv:1803.05573","author":"Salimans Tim","year":"2018","unstructured":"Tim Salimans , Han Zhang , Alec Radford , and Dimitris Metaxas . 2018. Improving GANs using optimal transport. arXiv preprint arXiv:1803.05573 ( 2018 ). Tim Salimans, Han Zhang, Alec Radford, and Dimitris Metaxas. 2018. Improving GANs using optimal transport. arXiv preprint arXiv:1803.05573 (2018)."},{"key":"e_1_3_2_1_98_1","volume-title":"Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 662--667","author":"Schuiki Fabian","year":"2019","unstructured":"Fabian Schuiki , Michael Schaffner , and Luca Benini . 2019 . Ntx: An energy-efficient streaming accelerator for floating-point generalized reduction workloads in 22 nm fd-soi. In 2019 Design , Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 662--667 . Fabian Schuiki, Michael Schaffner, and Luca Benini. 2019. Ntx: An energy-efficient streaming accelerator for floating-point generalized reduction workloads in 22 nm fd-soi. In 2019 Design, Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 662--667."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2012.2193936"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.1109\/MDAT.2016.2633408"},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054126"},{"key":"e_1_3_2_1_102_1","volume-title":"LACore: A Supercomputing-Like Linear Algebra Accelerator for SoC-Based Designs. In 2017 IEEE International Conference on Computer Design (ICCD). IEEE, 137--144","author":"Steffl Samuel","year":"2017","unstructured":"Samuel Steffl and Sherief Reda . 2017 . LACore: A Supercomputing-Like Linear Algebra Accelerator for SoC-Based Designs. In 2017 IEEE International Conference on Computer Design (ICCD). IEEE, 137--144 . Samuel Steffl and Sherief Reda. 2017. LACore: A Supercomputing-Like Linear Algebra Accelerator for SoC-Based Designs. In 2017 IEEE International Conference on Computer Design (ICCD). IEEE, 137--144."},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863730"},{"key":"e_1_3_2_1_104_1","volume-title":"OpenCL: A parallel programming standard for heterogeneous computing systems. Computing in science & engineering","author":"Stone John E","year":"2010","unstructured":"John E Stone , David Gohara , and Guochun Shi . 2010. OpenCL: A parallel programming standard for heterogeneous computing systems. Computing in science & engineering , Vol. 12 , 3 ( 2010 ), 66--73. John E Stone, David Gohara, and Guochun Shi. 2010. OpenCL: A parallel programming standard for heterogeneous computing systems. Computing in science & engineering, Vol. 12, 3 (2010), 66--73."},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00054"},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSoC.2015.41"},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2002.997877"},{"key":"e_1_3_2_1_108_1","unstructured":"Vaishali Tehre Pankaj Agrawal and RV Kshrisagar. [n.d.]. Implementation of Fast Fourier Transform Accelerator on Coarse Grain Reconfigurable Architecture. ( [n. d.]).  Vaishali Tehre Pankaj Agrawal and RV Kshrisagar. [n.d.]. Implementation of Fast Fourier Transform Accelerator on Coarse Grain Reconfigurable Architecture. ( [n. d.])."},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080244"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.1109\/DATE.2004.1269069"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.1145\/3193827"},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446086"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"crossref","unstructured":"Jagath Weerasinghe Francois Abel Christoph Hagleitner and Andreas Herkersdorf. 2015. Enabling FPGAs in Hyperscale Data Centers. In 2015 IEEE 12th Intl Conf on Ubiquitous Intelligence and Computing and 2015 IEEE 12th Intl Conf on Autonomic and Trusted Computing and 2015 IEEE 15th Intl Conf on Scalable Computing and Communications and Its Associated Workshops (UIC-ATC-ScalCom). IEEE 1078--1086.  Jagath Weerasinghe Francois Abel Christoph Hagleitner and Andreas Herkersdorf. 2015. Enabling FPGAs in Hyperscale Data Centers. In 2015 IEEE 12th Intl Conf on Ubiquitous Intelligence and Computing and 2015 IEEE 12th Intl Conf on Autonomic and Trusted Computing and 2015 IEEE 15th Intl Conf on Scalable Computing and Communications and Its Associated Workshops (UIC-ATC-ScalCom). IEEE 1078--1086.","DOI":"10.1109\/UIC-ATC-ScalCom-CBDCom-IoP.2015.199"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1109\/SAMOS.2016.7818353"},{"key":"e_1_3_2_1_115_1","unstructured":"Xilinx [n.d.]. Partial Reconfiguration User Guide UG702 (v13.3). Xilinx. https: \/\/www.xilinx.com\/support\/documentation\/sw_manuals\/xilinx13_3\/ug702.pdf  Xilinx [n.d.]. Partial Reconfiguration User Guide UG702 (v13.3). Xilinx. https: \/\/www.xilinx.com\/support\/documentation\/sw_manuals\/xilinx13_3\/ug702.pdf"},{"key":"e_1_3_2_1_116_1","unstructured":"Xilinx [n.d.]. Partial Reconfiguration User Guide UG909 (v2018.1). Xilinx. https:\/\/www.xilinx.com\/support\/documentation\/sw_manuals\/xilinx2018_ 1\/ug909-vivado-partial-reconfiguration.pdf  Xilinx [n.d.]. Partial Reconfiguration User Guide UG909 (v2018.1). Xilinx. https:\/\/www.xilinx.com\/support\/documentation\/sw_manuals\/xilinx2018_ 1\/ug909-vivado-partial-reconfiguration.pdf"},{"key":"e_1_3_2_1_117_1","volume-title":"International Conference on High Performance Computing for Computational Science. Springer, 421--434","author":"Yamazaki Ichitaro","year":"2010","unstructured":"Ichitaro Yamazaki and Xiaoye S Li . 2010 . On techniques to improve robustness and scalability of a parallel hybrid linear solver . In International Conference on High Performance Computing for Computational Science. Springer, 421--434 . Ichitaro Yamazaki and Xiaoye S Li. 2010. On techniques to improve robustness and scalability of a parallel hybrid linear solver. In International Conference on High Performance Computing for Computational Science. Springer, 421--434."},{"key":"e_1_3_2_1_118_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3271697"},{"key":"e_1_3_2_1_119_1","doi-asserted-by":"publisher","DOI":"10.1145\/2684746.2689060"},{"key":"e_1_3_2_1_120_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2013.6670336"}],"event":{"name":"PACT '20: International Conference on Parallel Architectures and Compilation Techniques","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Virtual Event GA USA","acronym":"PACT '20"},"container-title":["Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3410463.3414627","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3410463.3414627","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3410463.3414627","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:31:51Z","timestamp":1750195911000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3410463.3414627"}},"subtitle":["Bridging the Efficiency Gap using Memory and Dataflow Reconfiguration"],"short-title":[],"issued":{"date-parts":[[2020,9,30]]},"references-count":120,"alternative-id":["10.1145\/3410463.3414627","10.1145\/3410463"],"URL":"https:\/\/doi.org\/10.1145\/3410463.3414627","relation":{},"subject":[],"published":{"date-parts":[[2020,9,30]]},"assertion":[{"value":"2020-09-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}