{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:40:13Z","timestamp":1772725213124,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":120,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Science Foundation (NSF)","award":["CF#2107598"],"award-info":[{"award-number":["CF#2107598"]}]},{"name":"National Science Foundation (NSF) awards","award":["CNS#1822273"],"award-info":[{"award-number":["CNS#1822273"]}]},{"name":"Defense Advanced Research Project Agency (DARPA)","award":["#HR0011-18-C-0020"],"award-info":[{"award-number":["#HR0011-18-C-0020"]}]},{"name":"National Institute of Health (NIH) a","award":["R01EB028350"],"award-info":[{"award-number":["R01EB028350"]}]},{"name":"Semiconductor Research Corporation (SRC)","award":["#2021-AH- 3039"],"award-info":[{"award-number":["#2021-AH- 3039"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640365","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"1165-1182","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["Tandem Processor: Grappling with Emerging Operators in Neural Networks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5514-8027","authenticated-orcid":false,"given":"Soroush","family":"Ghodrati","sequence":"first","affiliation":[{"name":"University of California, San Diego, San Diego, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0955-585X","authenticated-orcid":false,"given":"Sean","family":"Kinzer","sequence":"additional","affiliation":[{"name":"University of California, San Diego, San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0328-9610","authenticated-orcid":false,"given":"Hanyang","family":"Xu","sequence":"additional","affiliation":[{"name":"University of California, San Diego, San Deigo, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2887-9761","authenticated-orcid":false,"given":"Rohan","family":"Mahapatra","sequence":"additional","affiliation":[{"name":"University of California, San Diego, San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2333-292X","authenticated-orcid":false,"given":"Yoonsung","family":"Kim","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2648-8748","authenticated-orcid":false,"given":"Byung Hoon","family":"Ahn","sequence":"additional","affiliation":[{"name":"University of California, San Diego, Sa Diego, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6607-8869","authenticated-orcid":false,"given":"Dong Kai","family":"Wang","sequence":"additional","affiliation":[{"name":"UIUC, Champaign, IL, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5628-388X","authenticated-orcid":false,"given":"Lavanya","family":"Karthikeyan","sequence":"additional","affiliation":[{"name":"University of California, San Diego, San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8199-7671","authenticated-orcid":false,"given":"Amir","family":"Yazdanbakhsh","sequence":"additional","affiliation":[{"name":"Google DeepMind, Mountain View, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6629-449X","authenticated-orcid":false,"given":"Jongse","family":"Park","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Korea, South ? Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0442-5634","authenticated-orcid":false,"given":"Nam Sung","family":"Kim","sequence":"additional","affiliation":[{"name":"UIUC, Urbana, IL, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8548-1039","authenticated-orcid":false,"given":"Hadi","family":"Esmaeilzadeh","sequence":"additional","affiliation":[{"name":"University of California, San Diego (UCSD), San Diego, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Arm scalable vector extension (sve). https:\/\/developer.arm.com\/documentation\/102476\/0100."},{"key":"e_1_3_2_1_2_1","unstructured":"Intel advanced vector extensions (avx). https:\/\/www.intel.com\/content\/www\/us\/en\/architecture-and-technology\/avx-512-overview.html."},{"key":"e_1_3_2_1_3_1","unstructured":"Nvdla. http:\/\/nvdla.org\/index.html."},{"key":"e_1_3_2_1_4_1","unstructured":"Risc-v vector extensions. https:\/\/github.com\/riscv\/riscv-v-spec\/blob\/master\/v-spec.adoc."},{"key":"e_1_3_2_1_5_1","volume-title":"a small self-contained low-precision gemm library","year":"2022","unstructured":"gemmlowp: a small self-contained low-precision gemm library, 2022. https:\/\/github.com\/google\/gemmlowp."},{"key":"e_1_3_2_1_6_1","volume-title":"OSDI","author":"TensorFlow M. Abadi","year":"2016","unstructured":"M. Abadi et al. TensorFlow: A system for large-scale machine learning. OSDI, 2016."},{"key":"e_1_3_2_1_7_1","volume-title":"Your imagination's new best friend. https:\/\/www.adobe.com\/products\/firefly.html","year":"2023","unstructured":"Adobe. Your imagination's new best friend. https:\/\/www.adobe.com\/products\/firefly.html, 2023."},{"key":"e_1_3_2_1_8_1","volume-title":"ISCA","author":"Aklaghi Vahide","year":"2018","unstructured":"Vahide Aklaghi, Amir Yazdanbakhsh, Kambiz Samadi, Hadi Esmaeilzadeh, and Rajesh K. Gupta. Snapea: Predictive early activation for reducing computation in deep convolutional neural networks. In ISCA, 2018."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123982"},{"key":"e_1_3_2_1_10_1","volume-title":"ISCA","author":"Albericio Jorge","year":"2016","unstructured":"Jorge Albericio, Patrick Judd, Tayler Hetherington, Tor Aamodt, Natalie Enright Jerger, and Andreas Moshovos. Cnvlutin: ineffectual-neuron-free deep neural network computing. In ISCA, 2016."},{"key":"e_1_3_2_1_11_1","volume-title":"ASPLOS","author":"Ankit Aayush","year":"2019","unstructured":"Aayush Ankit, Izzat El Hajj, Sai Rahul Chalamalasetti, Geoffrey Ndu, Martin Foltin, R Stanley Williams, Paolo Faraboschi, Wen-mei W Hwu, John Paul Strachan, Kaushik Roy, et al. Puma: A programmable ultra-efficient memristor-based accelerator for machine learning inference. In ASPLOS, 2019."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511811449"},{"key":"e_1_3_2_1_13_1","volume-title":"ISSCC","author":"Bankman Daniel","year":"2018","unstructured":"Daniel Bankman, Lita Yang, Bert Moons, Marian Verhelst, and Boris Murmann. An always-on 3.8 \u03bcj\/86% cifar-10 mixed-signal binary cnn processor with all memory on chip in 28nm cmos. In ISSCC, 2018."},{"key":"e_1_3_2_1_14_1","volume-title":"ISSCC","author":"Beck Noah","year":"2018","unstructured":"Noah Beck, Sean White, Milam Paraschou, and Samuel Naffziger. 'zeppelin': An soc for multichip architectures. In ISSCC, 2018."},{"key":"e_1_3_2_1_15_1","volume-title":"OSDI","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In OSDI, 2018."},{"key":"e_1_3_2_1_16_1","volume-title":"ISCA","author":"Chen Yu-Hsin","year":"2016","unstructured":"Yu-Hsin Chen, Joel Emer, and Vivienne Sze. Eyeriss: Aspatial architecture for energy-efficient dataflow for convolutional neural networks. In ISCA, 2016."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"e_1_3_2_1_19_1","volume-title":"ISCA","author":"Chi Ping","year":"2016","unstructured":"Ping Chi, Shuangchen Li, Cong Xu, Tao Zhang, Jishen Zhao, Yongpan Liu, Yu Wang, and Yuan Xie. Prime: A novel processing-in-memory architecture for neural network computation in reram-based main memory. In ISCA, 2016."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2014.12"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2005.41"},{"key":"e_1_3_2_1_22_1","volume-title":"ASPLOS","author":"Lascorz Alberto Delmas","year":"2019","unstructured":"Alberto Delmas Lascorz, Patrick Judd, Dylan Malone Stuart, Zissis Poulos, Mostafa Mahmoud, Sayeh Sharify, Milos Nikolic, Kevin Siu, and Andreas Moshovos. Bit-tactical: A software\/hardware approach to exploiting value and bit sparsity in neural networks. In ASPLOS, 2019."},{"key":"e_1_3_2_1_23_1","first-page":"301","volume-title":"2003 IEEE International Conference on Acoustics, Speech, and Signal Processing, 2003. Proceedings.(ICASSP'03)","volume":"2","author":"Derby Jeff H","unstructured":"Jeff H Derby and JaimeH Moreno. A high-performance embedded dsp core with novel simd features. In 2003 IEEE International Conference on Acoustics, Speech, and Signal Processing, 2003. Proceedings.(ICASSP'03)., volume 2, pages II--301. IEEE, 2003."},{"key":"e_1_3_2_1_24_1","volume-title":"Ai-powered text-to-video - turn text into stunning videos. https:\/\/designs.ai\/","year":"2023","unstructured":"Designs.ai. Ai-powered text-to-video - turn text into stunning videos. https:\/\/designs.ai\/, 2023."},{"key":"e_1_3_2_1_25_1","unstructured":"ONNX Runtime developers. ONNX Runtime 11 2018."},{"key":"e_1_3_2_1_26_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv, 2018."},{"key":"e_1_3_2_1_27_1","volume-title":"MICRO","author":"Ding Caiwen","year":"2017","unstructured":"Caiwen Ding, Siyu Liao, Yanzhi Wang, Zhe Li, Ning Liu, Youwei Zhuo, Chao Wang, Xuehai Qian, Yu Bai, Geng Yuan, et al. Circnn: accelerating and compressing deep neural networks using block-circulant weight matrices. In MICRO, 2017."},{"key":"e_1_3_2_1_28_1","volume-title":"ISCA","author":"Eckert Charles","year":"2018","unstructured":"Charles Eckert, Xiaowei Wang, Jingcheng Wang, Arun Subramaniyan, Ravi Iyer, Dennis Sylvester, David Blaaauw, and Reetuparna Das. Neural cache: Bit-serial in-cache acceleration of deep neural networks. In ISCA, 2018."},{"key":"e_1_3_2_1_29_1","volume-title":"Vectorization for simd architectures with alignment constraints. Acm sigplan notices, 39(6):82--93","author":"Eichenberger Alexandre E","year":"2004","unstructured":"Alexandre E Eichenberger, Peng Wu, and Kevin O'brien. Vectorization for simd architectures with alignment constraints. Acm sigplan notices, 39(6):82--93, 2004."},{"key":"e_1_3_2_1_30_1","volume-title":"ISCA","author":"Esmaeilzadeh Hadi","year":"2011","unstructured":"Hadi Esmaeilzadeh, Emily Blem, Renee St. Amant, Karthikeyan Sankaralingam, and Doug Burger. Dark silicon and the end of multi-core scaling. In ISCA, 2011."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.48"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589348"},{"key":"e_1_3_2_1_33_1","volume-title":"ISCA","author":"Fowers Jeremy","year":"2018","unstructured":"Jeremy Fowers, Kalin Ovtcharov, Michael Papamichael, Todd Massengill, Ming Liu, Daniel Lo, Shlomi Alkalay, Michael Haselman, Logan Adams, Mahdi Ghandi, et al. A configurable cloud-scale dnn processor for real-time ai. In ISCA, 2018."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2004.840491"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"e_1_3_2_1_36_1","volume-title":"ASPLOS","author":"Gao Mingyu","year":"2019","unstructured":"Mingyu Gao, Xuan Yang, Jing Pu, Mark Horowitz, and Christos Kozyrakis. Tangram: Optimized coarse-grained dataflow for scalable nn accelerators. In ASPLOS, 2019."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586216"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00062"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414634"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218656"},{"key":"e_1_3_2_1_41_1","volume-title":"Bard: A conversational ai tool by google. https:\/\/bard.google.com","year":"2023","unstructured":"Google. Bard: A conversational ai tool by google. https:\/\/bard.google.com, 2023."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"e_1_3_2_1_43_1","volume-title":"ISCA","author":"Han Song","year":"2016","unstructured":"Song Han, Xingyu Liu, Huizi Mao, Jing Pu, Ardavan Pedram, Mark A Horowitz, and William J Dally. Eie: efficient inference engine on compressed deep neural network. In ISCA, 2016."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00080"},{"key":"e_1_3_2_1_46_1","volume-title":"Ucnn: Exploiting computational reuse in deep neural networks via weight repetition. arXiv","author":"Hegde Kartik","year":"2018","unstructured":"Kartik Hegde, Jiyong Yu, Rohit Agrawal, Mengjia Yan, Michael Pellauer, and Christopher W Fletcher. Ucnn: Exploiting computational reuse in deep neural networks via weight repetition. arXiv, 2018."},{"key":"e_1_3_2_1_47_1","volume-title":"ISCA","author":"Imani Mohsen","year":"2019","unstructured":"Mohsen Imani, Saransh Gupta, Yeseong Kim, and Tajana Rosing. Floatpim: In-memory acceleration of deep neural network training with high precision. In ISCA, 2019."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"e_1_3_2_1_49_1","volume-title":"ISCA","author":"Jouppi Norman P","year":"2021","unstructured":"Norman P Jouppi, Doe Hyun Yoon, Matthew Ashcraft, Mark Gottscho, Thomas B Jablin, George Kurian, James Laudon, Sheng Li, Peter Ma, Xiaoyu Ma, et al. Ten lessons from three generations shaped google's tpuv4i: Industrial product. In ISCA, 2021."},{"key":"e_1_3_2_1_50_1","volume-title":"ISCA","author":"Jouppi Norman P","year":"2017","unstructured":"Norman P Jouppi, Cliff Young, Nishant Patil, David Patterson, Gaurav Agrawal, Raminder Bajwa, Sarah Bates, Suresh Bhatia, Nan Boden, Al Borchers, et al. In-datacenter performance analysis of a tensor processing unit. In ISCA, 2017."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195661"},{"key":"e_1_3_2_1_52_1","volume-title":"ISCA","author":"Karandikar Sagar","year":"2018","unstructured":"Sagar Karandikar, Howard Mao, Donggyu Kim, David Biancolin, Alon Amid, Dayeol Lee, Nathan Pemberton, Emmanuel Amaro, Colin Schmidt, Aditya Chopra, et al. Firesim: Fpga-accelerated cycle-exact scale-out system simulation in the public cloud. In ISCA, 2018."},{"key":"e_1_3_2_1_53_1","volume-title":"ISCA","author":"Kim Duckhwan","year":"2016","unstructured":"Duckhwan Kim, Jaeha Kung, Sek Chai, Sudhakar Yalamanchili, and Saibal Mukhopadhyay. Neurocube: A programmable digital neuromorphic architecture with high-density 3d memory. In ISCA, 2016."},{"key":"e_1_3_2_1_54_1","volume-title":"ICML","author":"Kim Sehoon","year":"2021","unstructured":"Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W Mahoney, and Kurt Keutzer. I-bert: Integer-only bert quantization. In ICML, 2021."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1137\/130930352"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304028"},{"key":"e_1_3_2_1_57_1","volume-title":"ASPLOS","author":"Kwon Hyoukjun","year":"2018","unstructured":"Hyoukjun Kwon, Ananda Samajdar, and Tushar Krishna. Maeri: Enabling flexible dataflow mapping over dnn accelerators via reconfigurable interconnects. ASPLOS, 2018."},{"key":"e_1_3_2_1_58_1","volume-title":"Gregory Michael Thorson","author":"Lacy William","year":"2058","unstructured":"William Lacy, Gregory Michael Thorson, Christopher Aaron Clark, Norman Paul Jouppi, Thomas Norrie, and Andrew Everett Phelps. Vector Processing Unit. U.S Patent 11520581, 2022."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.2011.6105405"},{"key":"e_1_3_2_1_60_1","volume-title":"ISCA","author":"Li Zheng","year":"2022","unstructured":"Zheng Li, Soroush Ghodrati, Amir Yazdanbakhsh, Hadi Esmaeilzadeh, and Mingu Kang. Accelerating Attention through Gradient-Based Learned Runtime Pruning. In ISCA, 2022."},{"key":"e_1_3_2_1_61_1","volume-title":"ISCA","author":"LiKamWa Robert","year":"2016","unstructured":"Robert LiKamWa, Yunhui Hou, Julian Gao, Mia Polansky, and Lin Zhong. Redeye: analog convnet image sensor architecture for continuous mobile vision. In ISCA, 2016."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"e_1_3_2_1_63_1","volume-title":"ASP-LOS","author":"Mahapatra Rohan","year":"2024","unstructured":"Rohan Mahapatra, Soroush Ghodrati, Byung Hoon Ahn, Sean Kinzer, Shu ting Wang, Hanyang Xu, Lavanya Karthikeyan, Hardik Sharma, Amir Yazdanbakhsh, Mohammad Alian, and Hadi Esmaeilzadeh. In-storage domain-specific acceleration for serverless computing. ASP-LOS, 2024."},{"key":"e_1_3_2_1_64_1","first-page":"124","volume-title":"International Symposium on Innovations in Information and Communications Technology","author":"Sh Basil","year":"2011","unstructured":"Basil Sh. Mahmood and Mamoon A. Al Jbaar. Design and implementation of simd vector processor on fpga. In International Symposium on Innovations in Information and Communications Technology, pages 124--130, 2011."},{"key":"e_1_3_2_1_65_1","volume-title":"MICRO","author":"Mahmoud Mostafa","year":"2018","unstructured":"Mostafa Mahmoud, Kevin Siu, and Andreas Moshovos. Diffy: A d\u00e9j\u00e0 vu-free differential deep neural network accelerator. In MICRO, 2018."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CICC.2006.320923"},{"key":"e_1_3_2_1_67_1","volume-title":"Introducing audiocraft: A generative ai tool for audio and music. https:\/\/about.fb.com\/news\/2023\/08\/audiocraft-generative-ai-for-music-and-audio\/","year":"2023","unstructured":"Meta. Introducing audiocraft: A generative ai tool for audio and music. https:\/\/about.fb.com\/news\/2023\/08\/audiocraft-generative-ai-for-music-and-audio\/, 2023."},{"key":"e_1_3_2_1_68_1","volume-title":"Github copilot: Your ai pair programmer. https:\/\/github.com\/features\/copilot","year":"2023","unstructured":"Microsoft. Github copilot: Your ai pair programmer. https:\/\/github.com\/features\/copilot, 2023."},{"key":"e_1_3_2_1_69_1","unstructured":"Microsoft. Reinventing search with a new ai-powered microsoft bing and edge your copilot for the web. https:\/\/blogs.microsoft.com\/blog\/2023\/02\/07\/reinventing-search-with-a-new-ai-powered-microsoft-bing-and-edge-your-copilot-for-the-web\/ 2023."},{"key":"e_1_3_2_1_70_1","unstructured":"Facebook Research Microsoft. Onnx: an open format to represent deep learning models. http:\/\/onnx.ai\/ 2017."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2013.207"},{"key":"e_1_3_2_1_72_1","volume-title":"DATE","author":"Moons Bert","year":"2017","unstructured":"Bert Moons, Roel Uytterhoeven, Wim Dehaene, and Marian Verhelst. DVAFS: Trading Computational Accuracy for Energy Through Dynamic-Voltage-Accuracy-Frequency-Scaling. In DATE, 2017."},{"key":"e_1_3_2_1_73_1","volume-title":"Gp-simd processing-in-memory. ACM Transactions on Architecture and Code Optimization (TACO), 11(4):1--26","author":"Morad Amir","year":"2015","unstructured":"Amir Morad, Leonid Yavits, and Ran Ginosar. Gp-simd processing-in-memory. ACM Transactions on Architecture and Code Optimization (TACO), 11(4):1--26, 2015."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/305138.305150"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_76_1","volume-title":"Nvidia turing architecture in-depth. https:\/\/developer.nvidia.com\/blog\/nvidia-turing-architecture-in-depth\/","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. Nvidia turing architecture in-depth. https:\/\/developer.nvidia.com\/blog\/nvidia-turing-architecture-in-depth\/, 2022."},{"key":"e_1_3_2_1_77_1","volume-title":"https:\/\/chat.openai.com","author":"Chatgpt AI.","year":"2023","unstructured":"OpenAI. Chatgpt. https:\/\/chat.openai.com, 2023."},{"key":"e_1_3_2_1_78_1","volume-title":"ISCA","author":"Parashar Angshuman","year":"2017","unstructured":"Angshuman Parashar, Minsoo Rhu, Anurag Mukkara, Antonio Puglielli, Rangharajan Venkatesan, Brucek Khailany, Joel Emer, Stephen W Keckler, and William J Dally. SCNN: An Accelerator for Compressed-sparse Convolutional Neural Networks. In ISCA, 2017."},{"key":"e_1_3_2_1_79_1","volume-title":"ISCA","author":"Park Eunhyeok","year":"2018","unstructured":"Eunhyeok Park, Dongyoung Kim, and Sungjoo Yoo. Energy-Efficient Neural Network Accelerator Based on Outlier-Aware Low-Precision Computation. In ISCA, 2018."},{"key":"e_1_3_2_1_80_1","volume-title":"NeurIPS","author":"PyTorch A. Paszke","year":"2019","unstructured":"A. Paszke et al. PyTorch: An imperative style, high-performance deep learning library. NeurIPS, 2019."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304025"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_2_1_83_1","volume-title":"Language Models are Unsupervised Multitask Learners. OpenAI blog","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. Language Models are Unsupervised Multitask Learners. OpenAI blog, 2019."},{"key":"e_1_3_2_1_84_1","volume-title":"Mlperf inference benchmark. arxiv","author":"Reddi Vijay Janapa","year":"2019","unstructured":"Vijay Janapa Reddi, Christine Cheng, David Kanter, Peter Mattson, Guenther Schmuelling, Carole-Jean Wu, Brian Anderson, Maximilien Breughe, Mark Charlebois, William Chou, et al. Mlperf inference benchmark. arxiv, 2019."},{"key":"e_1_3_2_1_85_1","volume-title":"Yolov3: An incremental improvement. arXiv","author":"Redmon Joseph","year":"2018","unstructured":"Joseph Redmon and Ali Farhadi. Yolov3: An incremental improvement. arXiv, 2018."},{"key":"e_1_3_2_1_86_1","volume-title":"DAC","author":"Ryu Sungju","year":"2019","unstructured":"Sungju Ryu, Hyungjun Kim, Wooseok Yi, and Jae-Joon Kim. Bit-blade: Area and energy-efficient precision-scalable neural network accelerator with bitwise summation. In DAC, 2019."},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00016"},{"key":"e_1_3_2_1_88_1","volume-title":"Scale-sim: Systolic cnn accelerator simulator. arXiv","author":"Samajdar Ananda","year":"2018","unstructured":"Ananda Samajdar, Yuhao Zhu, Paul Whatmough, Matthew Mattina, and Tushar Krishna. Scale-sim: Systolic cnn accelerator simulator. arXiv, 2018."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_90_1","volume-title":"ISCA","author":"Shafiee Ali","year":"2016","unstructured":"Ali Shafiee, Anirban Nag, Naveen Muralimanohar, Rajeev Balasubramonian, John Paul Strachan, Miao Hu, R Stanley Williams, and Vivek Srikumar. Isaac: A convolutional neural network accelerator with in-situ analog arithmetic in crossbars. In ISCA, 2016."},{"key":"e_1_3_2_1_91_1","volume-title":"MICRO","author":"Shao Yakun Sophia","year":"2019","unstructured":"Yakun Sophia Shao, Jason Clemons, Rangharajan Venkatesan, Brian Zimmer, Matthew Fojtik, Nan Jiang, Ben Keller, Alicia Klinefelter, Nathaniel Pinckney, Priyanka Raina, et al. Simba: Scaling deep-learning inference with multi-chip-module-based architecture. In MICRO, 2019."},{"key":"e_1_3_2_1_92_1","volume-title":"ISCA","author":"Sharify Sayeh","year":"2019","unstructured":"Sayeh Sharify, Alberto Delmas Lascorz, Mostafa Mahmoud, Milos Nikolic, Kevin Siu, Dylan Malone Stuart, Zissis Poulos, and Andreas Moshovos. Laconic deep learning inference acceleration. In ISCA, 2019."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC.2018.8465915"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195659"},{"key":"e_1_3_2_1_95_1","volume-title":"ISCA","author":"Sharma Hardik","year":"2018","unstructured":"Hardik Sharma, Jongse Park, Naveen Suda, Liangzhen Lai, Benson Chau, Vikas Chandra, and Hadi Esmaeilzadeh. Bit fusion: Bit-level dynamically composable architecture for accelerating deep neural networks. ISCA, 2018."},{"key":"e_1_3_2_1_96_1","volume-title":"Introduction to the sifive intelligence x280. https:\/\/www.sifive.com\/blog\/introduction-to-the-sifive-intelligence-x280","year":"2022","unstructured":"SiFIve. Introduction to the sifive intelligence x280. https:\/\/www.sifive.com\/blog\/introduction-to-the-sifive-intelligence-x280, 2022."},{"key":"e_1_3_2_1_97_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. Very deep convolutional networks for large-scale image recognition. arXiv, 2014."},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.55"},{"key":"e_1_3_2_1_99_1","volume-title":"ISCA","author":"Song Mingcong","year":"2018","unstructured":"Mingcong Song, Jiechen Zhao, Yang Hu, Jiaqi Zhang, and Tao Li. Prediction based execution on deep neural networks. In ISCA, 2018."},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00068"},{"key":"e_1_3_2_1_101_1","volume-title":"ISCA","author":"Srivastava Prakalp","year":"2018","unstructured":"Prakalp Srivastava, Mingu Kang, Sujan K Gonugondla, Sungmin Lim, Jungwook Choi, Vikram Adve, Nam Sung Kim, and Naresh Shanbhag. Promise: An end-to-end design of a programmable mixed-signal accelerator for machine-learning algorithms. In ISCA, 2018."},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/CICC53496.2022.9772810"},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2003.1223637"},{"key":"e_1_3_2_1_104_1","volume-title":"ICML","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc V Le. Efficientnet: Rethinking model scaling for convolutional neural networks. ICML, 2019."},{"key":"e_1_3_2_1_105_1","volume-title":"Dojo chip. https:\/\/www.tesla.com\/AI","year":"2022","unstructured":"Tesla. Dojo chip. https:\/\/www.tesla.com\/AI, 2022."},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.1994.344733"},{"key":"e_1_3_2_1_107_1","volume-title":"ISCA","author":"Venkataramani Swagath","year":"2021","unstructured":"Swagath Venkataramani, Vijayalakshmi Srinivasan, Wei Wang, Sanchari Sen, Jintao Zhang, Ankur Agrawal, Monodeep Kar, Shubham Jain, Alberto Mannari, Hoang Tran, et al. RaPiD: AI Accelerator for Ultra-low Precision Training and Inference. In ISCA, 2021."},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.485896"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.14778\/1687627.1687671"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3129647"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00064"},{"key":"e_1_3_2_1_114_1","first-page":"11875","volume-title":"International Conference on Machine Learning","author":"Yao Zhewei","year":"2021","unstructured":"Zhewei Yao, Zhen Dong, Zhangcheng Zheng, Amir Gholami, Jiali Yu, Eric Tan, Leyuan Wang, Qijing Huang, Yida Wang, Michael Mahoney, et al. Hawq-v3: Dyadic neural network quantization. In International Conference on Machine Learning, pages 11875--11886. PMLR, 2021."},{"key":"e_1_3_2_1_115_1","volume-title":"ISCA","author":"Yazdanbakhsh Amir","year":"2018","unstructured":"Amir Yazdanbakhsh, Hajar Falahati, Philip J. Wolfe, Kambiz Samadi, Hadi Esmaeilzadeh, and Nam Sung Kim. GANAX: A Unified SIMD-MIMD Acceleration for Generative Adversarial Network. In ISCA, 2018."},{"key":"e_1_3_2_1_116_1","volume-title":"White Paper","author":"Yiu Joseph","year":"2020","unstructured":"Joseph Yiu. Blending dsp and ml features into a low-power general-purpose processor - how far can we go? White Paper, 2020."},{"key":"e_1_3_2_1_117_1","volume-title":"ISCA","author":"Yuan Geng","year":"2021","unstructured":"Geng Yuan, Payman Behnam, Zhengang Li, Ali Shafiee, Sheng Lin, Xiaolong Ma, Hang Liu, Xuehai Qian, Mahdi Nazm Bojnordi, Yanzhi Wang, et al. Forms: fine-grained polarized reram-based in-situ computation for mixed-signal dnn accelerator. In ISCA, 2021."},{"key":"e_1_3_2_1_118_1","doi-asserted-by":"publisher","DOI":"10.1145\/2684746.2689060"},{"key":"e_1_3_2_1_119_1","volume-title":"ISCA","author":"Zhang Jiaqi","year":"2019","unstructured":"Jiaqi Zhang, Xiangru Chen, Mingcong Song, and Tao Li. Eager Pruning: Algorithm and Architecture Support for Fast Training of Deep Neural Networks. In ISCA, 2019."},{"key":"e_1_3_2_1_120_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195662"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640365","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640365","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:41Z","timestamp":1750291421000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640365"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":120,"alternative-id":["10.1145\/3620665.3640365","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640365","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}