{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T22:21:51Z","timestamp":1766269311186,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":98,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,3,9]],"date-time":"2020-03-09T00:00:00Z","timestamp":1583712000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["NRF-2018R1A5A1059921"],"award-info":[{"award-number":["NRF-2018R1A5A1059921"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100014553","name":"Samsung Advanced Institute of Technology","doi-asserted-by":"publisher","award":["SRFC-TB1703-03"],"award-info":[{"award-number":["SRFC-TB1703-03"]}],"id":[{"id":"10.13039\/100014553","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,3,9]]},"DOI":"10.1145\/3373376.3378494","type":"proceedings-article","created":{"date-parts":[[2020,3,13]],"date-time":"2020-03-13T22:37:01Z","timestamp":1584139021000},"page":"1109-1124","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":25,"title":["NeuMMU"],"prefix":"10.1145","author":[{"given":"Bongjoon","family":"Hyun","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"Youngeun","family":"Kwon","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"Yujeong","family":"Choi","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"John","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]},{"given":"Minsoo","family":"Rhu","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, South Korea"}]}],"member":"320","published-online":{"date-parts":[[2020,3,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694381"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123982"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001138"},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. AMD's I\/O Virtualization Technology (IOMMU) Specifcation 2018.  AMD. AMD's I\/O Virtualization Technology (IOMMU) Specifcation 2018."},{"key":"e_1_3_2_1_5_1","volume-title":"Manuals and ISA Documents -- AMD","author":"Developer Guides AMD.","year":"2018","unstructured":"AMD. Developer Guides , Manuals and ISA Documents -- AMD . 2018 . AMD. Developer Guides, Manuals and ISA Documents -- AMD. 2018."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123975"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173169"},{"volume-title":"DeepBench: Benchmarking Deep Learning Operations on Diferent Hardware","year":"2017","key":"e_1_3_2_1_8_1","unstructured":"Baidu. DeepBench: Benchmarking Deep Learning Operations on Diferent Hardware , 2017 . Baidu. DeepBench: Benchmarking Deep Learning Operations on Diferent Hardware, 2017."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815970"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540741"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749717"},{"key":"e_1_3_2_1_12_1","volume-title":"Atten and Spell: A Neural Network for Large Vocabulary Conversational Speech Recognition. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Chan William","year":"2016","unstructured":"William Chan , Navdeep Jaitly , Quoc Le , and Oriol Vinyals . Listen , Atten and Spell: A Neural Network for Large Vocabulary Conversational Speech Recognition. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) , 2016 . William Chan, Navdeep Jaitly, Quoc Le, and Oriol Vinyals. Listen, Atten and Spell: A Neural Network for Large Vocabulary Conversational Speech Recognition. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2016."},{"key":"e_1_3_2_1_13_1","volume-title":"USIMM: the Utah SImulated Memory Module","author":"Chatterjee N.","year":"2012","unstructured":"N. Chatterjee , R. Balasubramonian , M. Shevgoor , S. Pugsley , A. Udipi , A. Shafee , K. Sudan , M. Awasthi , and Z. Chishti . USIMM: the Utah SImulated Memory Module , 2012 . N. Chatterjee, R. Balasubramonian, M. Shevgoor, S. Pugsley, A. Udipi, A. Shafee, K. Sudan, M. Awasthi, and Z. Chishti. USIMM: the Utah SImulated Memory Module, 2012."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS)","author":"Chen T.","year":"2014","unstructured":"T. Chen , Z. Du , N. Sun , J. Wang , C. Wu , Y. Chen , and O. Temam . Dian-Nao: A Small-footprint High-throughput Accelerator for Ubiquitous Machine-learning . In Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS) , 2014 . T. Chen, Z. Du, N. Sun, J. Wang, C. Wu, Y. Chen, and O. Temam. Dian-Nao: A Small-footprint High-throughput Accelerator for Ubiquitous Machine-learning. In Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS), 2014."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2016.7418007"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA), 2016. ASPLOS '20, March 16--20","author":"Chi P.","year":"2020","unstructured":"P. Chi , S. Li , C. Xu , T. Zhang , J. Zhao , Y. Liu , Y. Wang , and Y. Xie . A Novel Processing-in-memory Architecture for Neural Network Computation in ReRAM-based Main Memory . In Proceedings of the International Symposium on Computer Architecture (ISCA), 2016. ASPLOS '20, March 16--20 , 2020 , Lausanne, Switzerland Bongjoon Hyun, Youngeun Kwon, Yujeong Choi, John Kim, and Minsoo Rhu P. Chi, S. Li, C. Xu, T. Zhang, J. Zhao, Y. Liu, Y. Wang, and Y. Xie. A Novel Processing-in-memory Architecture for Neural Network Computation in ReRAM-based Main Memory. In Proceedings of the International Symposium on Computer Architecture (ISCA), 2016. ASPLOS '20, March 16--20, 2020, Lausanne, Switzerland Bongjoon Hyun, Youngeun Kwon, Yujeong Choi, John Kim, and Minsoo Rhu"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00027"},{"key":"e_1_3_2_1_20_1","volume-title":"Bit-Tactical: Exploiting Inefectual Computations in Convolutional Neural Networks: Which, Why, and How","author":"Delmas A.","year":"2018","unstructured":"A. Delmas , P. Judd , D. Stuart , Z. Poulos , M. Mahmoud , S. Sharify , M. Nikolic , and A. Moshovos . Bit-Tactical: Exploiting Inefectual Computations in Convolutional Neural Networks: Which, Why, and How , 2018 . A. Delmas, P. Judd, D. Stuart, Z. Poulos, M. Mahmoud, S. Sharify, M. Nikolic, and A. Moshovos. Bit-Tactical: Exploiting Inefectual Computations in Convolutional Neural Networks: Which, Why, and How, 2018."},{"key":"e_1_3_2_1_21_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In arxiv.org","author":"Devlin J.","year":"2018","unstructured":"J. Devlin , M. Chang , K. Lee , and K. Toutanova . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In arxiv.org , 2018 . J. Devlin, M. Chang, K. Lee, and K. Toutanova. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In arxiv.org, 2018."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750389"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830789"},{"volume-title":"Accelerating Facebook's infrastructure with Application-Specifc Hardware","year":"2019","key":"e_1_3_2_1_24_1","unstructured":"Facebook. Accelerating Facebook's infrastructure with Application-Specifc Hardware , 2019 . Facebook. Accelerating Facebook's infrastructure with Application-Specifc Hardware, 2019."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"volume-title":"Cloud TPUs: ML accelerators for TensorFlow","year":"2017","key":"e_1_3_2_1_26_1","unstructured":"Google. Cloud TPUs: ML accelerators for TensorFlow , 2017 . Google. Cloud TPUs: ML accelerators for TensorFlow, 2017."},{"volume-title":"Google Cloud TPU Beta Release","year":"2018","key":"e_1_3_2_1_27_1","unstructured":"Google. Google Cloud TPU Beta Release , 2018 . Google. Google Cloud TPU Beta Release, 2018."},{"key":"e_1_3_2_1_28_1","volume-title":"Neural Turing Machines. In arxiv.org","author":"Graves A.","year":"2014","unstructured":"A. Graves , G. Wayne , and I. Danihelka . Neural Turing Machines. In arxiv.org , 2014 . A. Graves, G. Wayne, and I. Danihelka. Neural Turing Machines. In arxiv.org, 2014."},{"key":"e_1_3_2_1_29_1","volume-title":"The Architectural Implications of Facebook's DNN-based Personalized Recommendation. In arxiv.org","author":"Gupta U.","year":"2019","unstructured":"U. Gupta , X. Wang , M. Naumov , C. Wu , B. Reagen , D. Brooks , B. Cottel , K. Hazelwood , B. Jia , H. S. Lee , A. Malevich , D. Mudigere , M. Smelyanskiy , L. Xiong , and X. Zhang . The Architectural Implications of Facebook's DNN-based Personalized Recommendation. In arxiv.org , 2019 . U. Gupta, X.Wang, M. Naumov, C.Wu, B. Reagen, D. Brooks, B. Cottel, K. Hazelwood, B. Jia, H. S. Lee, A. Malevich, D. Mudigere, M. Smelyanskiy, L. Xiong, and X. Zhang. The Architectural Implications of Facebook's DNN-based Personalized Recommendation. In arxiv.org, 2019."},{"volume-title":"Gaudi Training Platform White Paper","year":"2019","key":"e_1_3_2_1_30_1","unstructured":"Habana. Gaudi Training Platform White Paper , 2019 . Habana. Gaudi Training Platform White Paper, 2019."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.30"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.19"},{"key":"e_1_3_2_1_33_1","volume-title":"Jian Sun. Deep Residual Learning for Image Recognition. In Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR)","author":"He Kaiming","year":"2016","unstructured":"Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun. Deep Residual Learning for Image Recognition. In Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR) , 2016 . Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep Residual Learning for Image Recognition. In Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR), 2016."},{"key":"e_1_3_2_1_34_1","volume-title":"Tat-Seng Chua. Neural Collaborative Filtering. In Proceedings of the International Conference on World Wide Web (WWW)","author":"He Xiangnan","year":"2017","unstructured":"Xiangnan He , Lizi Liao , Hanwang Zhang , Liqiang Nie , Xia Hu , and Tat-Seng Chua. Neural Collaborative Filtering. In Proceedings of the International Conference on World Wide Web (WWW) , 2017 . Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. Neural Collaborative Filtering. In Proceedings of the International Conference on World Wide Web (WWW), 2017."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295710"},{"key":"e_1_3_2_1_36_1","volume-title":"Energy Table for 45nm Process","author":"Horowitz M.","year":"2013","unstructured":"M. Horowitz . Energy Table for 45nm Process , 2013 . M. Horowitz. Energy Table for 45nm Process, 2013."},{"key":"e_1_3_2_1_37_1","volume-title":"Cycle Time, Area, Leakage, and Dynamic Power Model","author":"Labs HP","year":"2016","unstructured":"HP Labs . CACTI: An Integrated Cache and Memory Access Time , Cycle Time, Area, Leakage, and Dynamic Power Model , 2016 . HP Labs. CACTI: An Integrated Cache and Memory Access Time, Cycle Time, Area, Leakage, and Dynamic Power Model, 2016."},{"key":"e_1_3_2_1_38_1","volume-title":"Heterogeneous System Architecture","author":"Foundation HSA","year":"2018","unstructured":"HSA Foundation . Heterogeneous System Architecture , 2018 . HSA Foundation. Heterogeneous System Architecture, 2018."},{"volume-title":"Codenamed Haswell","year":"2013","key":"e_1_3_2_1_39_1","unstructured":"Intel. 4th Generation Intel Core Processor , Codenamed Haswell , 2013 . Intel. 4th Generation Intel Core Processor, Codenamed Haswell, 2013."},{"volume-title":"Intel 64 and IA-32 Architectures Software Developer's Manual","year":"2018","key":"e_1_3_2_1_40_1","unstructured":"Intel. Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide Part 1 . 2018 . Intel. Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide Part 1. 2018."},{"volume-title":"Intel Nervana Hardware: Neural Network Processor (Lake Crest)","year":"2018","key":"e_1_3_2_1_41_1","unstructured":"Intel-Nervana. Intel Nervana Hardware: Neural Network Processor (Lake Crest) , 2018 . Intel-Nervana. Intel Nervana Hardware: Neural Network Processor (Lake Crest), 2018."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS)","author":"Jacob B. L.","year":"1998","unstructured":"B. L. Jacob and T. N. Mudge . A Look at Several Memory Management Units, TLB-Refll Mechanisms, and Page Table Organizations . In Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS) , 1998 . B. L. Jacob and T. N. Mudge. A Look at Several Memory Management Units, TLB-Refll Mechanisms, and Page Table Organizations. In Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS), 1998."},{"key":"e_1_3_2_1_43_1","volume-title":"High Bandwidth Memory (HBM2) DRAM","author":"JEDEC.","year":"2018","unstructured":"JEDEC. High Bandwidth Memory (HBM2) DRAM . 2018 . JEDEC. High Bandwidth Memory (HBM2) DRAM. 2018."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_45_1","volume-title":"Exploring the Limits of Language Modeling. In arxiv.org","author":"Jozefowicz Rafal","year":"2016","unstructured":"Rafal Jozefowicz , Oriol Vinyals , Mike Schuster , Noam Shazeer , and YonghuiWu. Exploring the Limits of Language Modeling. In arxiv.org , 2016 . Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and YonghuiWu. Exploring the Limits of Language Modeling. In arxiv.org, 2016."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783722"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/545214.545237"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001178"},{"key":"e_1_3_2_1_49_1","volume-title":"OneWeird Trick For Parallelizing Convolutional Neural Networks","author":"Krizhevsky A.","year":"2014","unstructured":"A. Krizhevsky . OneWeird Trick For Parallelizing Convolutional Neural Networks , 2014 . A. Krizhevsky. OneWeird Trick For Parallelizing Convolutional Neural Networks, 2014."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS)","author":"Krizhevsky A.","year":"2012","unstructured":"A. Krizhevsky , I. Sutskever , and G. Hinton . ImageNet Classifcation with Deep Convolutional Neural Networks . In Proceedings of the International Conference on Neural Information Processing Systems (NIPS) , 2012 . A. Krizhevsky, I. Sutskever, and G. Hinton. ImageNet Classifcation with Deep Convolutional Neural Networks. In Proceedings of the International Conference on Neural Information Processing Systems (NIPS), 2012."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_52_1","author":"Kwon Y.","year":"2018","unstructured":"Y. Kwon and M. Rhu . A Case for Memory-Centric HPC System Architecture for Training Deep Neural Networks. In IEEE Computer Architecture Letters , 2018 . Y. Kwon and M. Rhu. A Case for Memory-Centric HPC System Architecture for Training Deep Neural Networks. In IEEE Computer Architecture Letters, 2018.","journal-title":"In IEEE Computer Architecture Letters"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00021"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2019.2929165"},{"key":"e_1_3_2_1_55_1","volume-title":"OSDI","author":"Kwon Youngjin","year":"2016","unstructured":"Youngjin Kwon , Hangchen Yu , Simon Peter , Christopher J. Rossbach , and Emmett Witchel . Coordinated and Efcient Huge Page Management with Ingens . In OSDI , 2016 . Youngjin Kwon, Hangchen Yu, Simon Peter, Christopher J. Rossbach, and Emmett Witchel. Coordinated and Efcient Huge Page Management with Ingens. In OSDI, 2016."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304044"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA), 2016. NeuMMU: Architectural Support for Eficient Address Translations in Neural Processing Units ASPLOS '20, March 16--20","author":"LiKamWa R.","year":"2020","unstructured":"R. LiKamWa , Y. Hou , M. Polansky , Y. Gao , and L. Zhong . RedEye: Analog ConvNet Image Sensor Architecture for Continuous Mobile Vision . In Proceedings of the International Symposium on Computer Architecture (ISCA), 2016. NeuMMU: Architectural Support for Eficient Address Translations in Neural Processing Units ASPLOS '20, March 16--20 , 2020 , Lausanne, Switzerland R. LiKamWa, Y. Hou, M. Polansky, Y. Gao, and L. Zhong. RedEye: Analog ConvNet Image Sensor Architecture for Continuous Mobile Vision. In Proceedings of the International Symposium on Computer Architecture (ISCA), 2016. NeuMMU: Architectural Support for Eficient Address Translations in Neural Processing Units ASPLOS '20, March 16--20, 2020, Lausanne, Switzerland"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694358"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001179"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446050"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124534"},{"volume-title":"ML Hardware Accelerators","year":"2019","key":"e_1_3_2_1_63_1","unstructured":"MLPerf. MLPerf: A Broad ML Benchmark Suite for Measuring Performance of ML Software Frameworks , ML Hardware Accelerators , and ML Cloud Platforms , 2019 . MLPerf. MLPerf: A Broad ML Benchmark Suite for Measuring Performance of ML Software Frameworks, ML Hardware Accelerators, and ML Cloud Platforms, 2019."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3174243.3174258"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.23919\/FPL.2017.8056823"},{"key":"e_1_3_2_1_66_1","volume-title":"Deep Learning Recommendation Model for Personalization and Recommendation Systems. In arxiv.org","author":"Naumov M.","year":"2019","unstructured":"M. Naumov , D. Mudigere , H. Shi , J. Huang , N. Sundaraman , J. Park , X. Wang , U. Gupta , C. Wu , A. Azzolini , D. Dzhulgakov , A. Mallevich , I. Cherniavskii , Y. Lu , R. Krishnamoorthi , A. Yu , V. Kondratenko , S. Pereira , X. Chen , W. Chen , V. Rao , B. Jia , L. Xiong , and M. Smelyanskiy . Deep Learning Recommendation Model for Personalization and Recommendation Systems. In arxiv.org , 2019 . M. Naumov, D. Mudigere, H. Shi, J. Huang, N. Sundaraman, J. Park, X. Wang, U. Gupta, C. Wu, A. Azzolini, D. Dzhulgakov, A. Mallevich, I. Cherniavskii, Y. Lu, R. Krishnamoorthi, A. Yu, V. Kondratenko, S. Pereira, X. Chen, W. Chen, V. Rao, B. Jia, L. Xiong, and M. Smelyanskiy. Deep Learning Recommendation Model for Personalization and Recommendation Systems. In arxiv.org, 2019."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021740"},{"key":"e_1_3_2_1_68_1","first-page":"6","year":"2013","unstructured":"NVIDIA. Unifed Memory in CUDA 6 , 2013 . NVIDIA. Unifed Memory in CUDA 6, 2013.","journal-title":"NVIDIA. Unifed Memory in CUDA"},{"key":"e_1_3_2_1_69_1","volume-title":"NVIDIA CUDA Programming Guide","author":"NVIDIA.","year":"2016","unstructured":"NVIDIA. NVIDIA CUDA Programming Guide , 2016 . NVIDIA. NVIDIA CUDA Programming Guide, 2016."},{"key":"e_1_3_2_1_70_1","volume-title":"NVLINK High-Speed Interconnect","author":"NVIDIA.","year":"2016","unstructured":"NVIDIA. NVLINK High-Speed Interconnect , 2016 . NVIDIA. NVLINK High-Speed Interconnect, 2016."},{"key":"e_1_3_2_1_71_1","volume-title":"The NVIDIA DGX-2 Deep Learning System","author":"NVIDIA.","year":"2017","unstructured":"NVIDIA. The NVIDIA DGX-2 Deep Learning System , 2017 . NVIDIA. The NVIDIA DGX-2 Deep Learning System, 2017."},{"key":"e_1_3_2_1_72_1","first-page":"V100","year":"2018","unstructured":"NVIDIA. NVIDIA Tesla V100 , 2018 . NVIDIA. NVIDIA Tesla V100, 2018.","journal-title":"NVIDIA. NVIDIA Tesla"},{"key":"e_1_3_2_1_73_1","volume-title":"NVSwitch: Leveraging NVLink to Maximum Efect","author":"NVIDIA.","year":"2018","unstructured":"NVIDIA. NVSwitch: Leveraging NVLink to Maximum Efect , 2018 . NVIDIA. NVSwitch: Leveraging NVLink to Maximum Efect, 2018."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"e_1_3_2_1_75_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA)","author":"Park E.","year":"2018","unstructured":"E. Park , D. Kim , and S. Yoo . Energy-Efcient Neural Network Accelerator Based on Outlier-Aware Low-Precision Computation . In Proceedings of the International Symposium on Computer Architecture (ISCA) , 2018 . E. Park, D. Kim, and S. Yoo. Energy-Efcient Neural Network Accelerator Based on Outlier-Aware Low-Precision Computation. In Proceedings of the International Symposium on Computer Architecture (ISCA), 2018."},{"key":"e_1_3_2_1_76_1","volume-title":"Y. Jia, L. Qiao, V. Rao, N. Rotem, S. Yoo, and M. Smelyanskiy. Deep Learning Inference in Facebook Data Centers: Characterization, Performance Optimizations and Hardware Implications. In arxiv.org","author":"Park J.","year":"2018","unstructured":"J. Park , M. Naumov , P. Basu , S. Deng , A. Kalaiah , D. Khudia , J. Law , P. Malani , A. Malevich , S. Nadathur , J. Pino , M. Schatz , A. Sidorov , V. Sivakumar , A. Tulloch , X. Wang , Y. Wu , H. Yuen , U. Diril , D. Dzhulgakov , K. Hazelwood an B. Jia , Y. Jia, L. Qiao, V. Rao, N. Rotem, S. Yoo, and M. Smelyanskiy. Deep Learning Inference in Facebook Data Centers: Characterization, Performance Optimizations and Hardware Implications. In arxiv.org , 2018 . J. Park, M. Naumov, P. Basu, S. Deng, A. Kalaiah, D. Khudia, J. Law, P. Malani, A. Malevich, S. Nadathur, J. Pino, M. Schatz, A. Sidorov, V. Sivakumar, A. Tulloch, X. Wang, Y. Wu, H. Yuen, U. Diril, D. Dzhulgakov, K. Hazelwood an B. Jia, Y. Jia, L. Qiao, V. Rao, N. Rotem, S. Yoo, and M. Smelyanskiy. Deep Learning Inference in Facebook Data Centers: Characterization, Performance Optimizations and Hardware Implications. In arxiv.org, 2018."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541942"},{"key":"e_1_3_2_1_78_1","volume-title":"GPU Lanes. In Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA)","author":"Power J.","year":"2014","unstructured":"J. Power , M. Hill , and D. Wood . Supporting x86--64 Address Translation for 100s of GPU Lanes. In Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA) , 2014 . J. Power, M. Hill, and D.Wood. Supporting x86--64 Address Translation for 100s of GPU Lanes. In Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA), 2014."},{"key":"e_1_3_2_1_79_1","volume-title":"High-Accuracy Deep Neural Network Accelerators. In Proceedings of the International Symposium on Computer Architecture (ISCA)","author":"Reagen B.","year":"2016","unstructured":"B. Reagen , P. Whatmough , R. Adolf , S. Rama , H. Lee , S. Lee , J. Miguel , H. Lobato , G. Wei , and D. Brooks . Minerva: Enabling Low-Power , High-Accuracy Deep Neural Network Accelerators. In Proceedings of the International Symposium on Computer Architecture (ISCA) , 2016 . B. Reagen, P. Whatmough, R. Adolf, S. Rama, H. Lee, S. Lee, J. Miguel, H. Lobato, G. Wei, and D. Brooks. Minerva: Enabling Low-Power, High-Accuracy Deep Neural Network Accelerators. In Proceedings of the International Symposium on Computer Architecture (ISCA), 2016."},{"key":"e_1_3_2_1_80_1","volume-title":"Memory-Efcient Neural Network Design. In Proceedings of the International Symposium on Microarchitecture (MICRO)","author":"Rhu M.","year":"2016","unstructured":"M. Rhu , N. Gimelshein , J. Clemons , A. Zulfqar , and S. W. Keckler . vDNN: Virtualized Deep Neural Networks for Scalable , Memory-Efcient Neural Network Design. In Proceedings of the International Symposium on Microarchitecture (MICRO) , 2016 . M. Rhu, N. Gimelshein, J. Clemons, A. Zulfqar, and S. W. Keckler. vDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efcient Neural Network Design. In Proceedings of the International Symposium on Microarchitecture (MICRO), 2016."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00017"},{"key":"e_1_3_2_1_82_1","volume-title":"DRAMSim2: A Cycle Accurate Memory System Simulator","author":"Rosenfeld P.","year":"2011","unstructured":"P. Rosenfeld , E. Cooper-Balis , and B. Jacob . DRAMSim2: A Cycle Accurate Memory System Simulator , 2011 . P. Rosenfeld, E. Cooper-Balis, and B. Jacob. DRAMSim2: A Cycle Accurate Memory System Simulator, 2011."},{"key":"e_1_3_2_1_83_1","first-page":"05","author":"Ross J.","year":"2015","unstructured":"J. Ross . Prefetching Weights for Use in a Neural Network Processor. Patent , 05 2015 . US 9805304B2. J. Ross. Prefetching Weights for Use in a Neural Network Processor. Patent, 05 2015. US 9805304B2.","journal-title":"Neural Network Processor. Patent"},{"key":"e_1_3_2_1_84_1","first-page":"05","author":"Ross J.","year":"2015","unstructured":"J. Ross , N. Jouppi , A. Phelps , R. Young , T. Norrie , G. Thorson , and D. Luu . Neural Network Processor. Patent , 05 2015 . US 9747546B2. J. Ross, N. Jouppi, A. Phelps, R. Young, T. Norrie, G. Thorson, and D. Luu. Neural Network Processor. Patent, 05 2015. US 9747546B2.","journal-title":"Neural Network Processor. Patent"},{"key":"e_1_3_2_1_85_1","first-page":"05","author":"Ross J.","year":"2015","unstructured":"J. Ross and A. Phelps . Computing Convolutions Using a Neural Network Processor. Patent , 05 2015 . US 9697463B2. J. Ross and A. Phelps. Computing Convolutions Using a Neural Network Processor. Patent, 05 2015. US 9697463B2.","journal-title":"Computing Convolutions Using a Neural Network Processor. Patent"},{"key":"e_1_3_2_1_86_1","first-page":"05","year":"2015","unstructured":"J. Ross and G. Thorson. Rotating Data for Neural Network Computations. Patent , 05 2015 . US 9747548B2. J. Ross and G. Thorson. Rotating Data for Neural Network Computations. Patent, 05 2015. US 9747548B2.","journal-title":"J. Ross and G. Thorson. Rotating Data for Neural Network Computations. Patent"},{"key":"e_1_3_2_1_87_1","volume-title":"Unifed Memory on Pascal and Volta","author":"Sakharnykh Nikolay","year":"2017","unstructured":"Nikolay Sakharnykh . Unifed Memory on Pascal and Volta , 2017 . Nikolay Sakharnykh. Unifed Memory on Pascal and Volta, 2017."},{"key":"e_1_3_2_1_88_1","volume-title":"Recency-based TLBpreloading. In Proceedings of the International Symposium on Computer Architecture (ISCA)","author":"Saulsbury A.","year":"2000","unstructured":"A. Saulsbury , F. Dahlgren , and P. Stenstrom . Recency-based TLBpreloading. In Proceedings of the International Symposium on Computer Architecture (ISCA) , 2000 . A. Saulsbury, F. Dahlgren, and P. Stenstrom. Recency-based TLBpreloading. In Proceedings of the International Symposium on Computer Architecture (ISCA), 2000."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001139"},{"key":"e_1_3_2_1_90_1","volume-title":"Sparc T4: A Dynamically Threaded Server-on-a-chip","author":"Shah M.","year":"2012","unstructured":"M. Shah , R. Golla , G. Grohoski , P. Jordan , J. Barreh , J. Brooks , M. Greenberg , G. Levinsky , M. Luttrell , C. Olson , Z. Samoail , M. Smittle , and T. Ziaja . Sparc T4: A Dynamically Threaded Server-on-a-chip . In IEEE Micro , 2012 . M. Shah, R. Golla, G. Grohoski, P. Jordan, J. Barreh, J. Brooks, M. Greenberg, G. Levinsky, M. Luttrell, C. Olson, Z. Samoail, M. Smittle, and T. Ziaja. Sparc T4: A Dynamically Threaded Server-on-a-chip. In IEEE Micro, 2012."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_93_1","volume-title":"arxiv.org","author":"Vaswani A.","year":"2017","unstructured":"A. Vaswani , N. Shazeer , N. Parmar , J. Uszkoreit , L. Jones , A. N. Gomez , L. Kaiser , and I. Polosukhin . Attention Is All You Need. In arxiv.org , 2017 . A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, and I. Polosukhin. Attention Is All You Need. In arxiv.org, 2017."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952679"},{"key":"e_1_3_2_1_95_1","volume-title":"Proceedings of the International Solid State Circuits Conference (ISSCC)","author":"Whatmough P.","year":"2017","unstructured":"P. Whatmough , S. Lee , H. Lee , S. Rama , D. Brooks , and G. Wei . A 28nm SoC with a 1.2 GHz 568nJ\/Prediction Sparse Deep-Neural-Network Engine with &gt;0.1 Timing Error Rate Tolerance for IoT Applications . In Proceedings of the International Solid State Circuits Conference (ISSCC) , February 2017 . ASPLOS '20, March 16 --20 , 2020, Lausanne, Switzerland P. Whatmough, S. Lee, H. Lee, S. Rama, D. Brooks, and G.Wei. A 28nm SoC with a 1.2 GHz 568nJ\/Prediction Sparse Deep-Neural-Network Engine with &gt;0.1 Timing Error Rate Tolerance for IoT Applications. In Proceedings of the International Solid State Circuits Conference (ISSCC), February 2017. ASPLOS '20, March 16--20, 2020, Lausanne, Switzerland"},{"key":"e_1_3_2_1_96_1","volume-title":"Hot Chips: A Symposium on High Performance Chips","author":"Whatmough P.","year":"2017","unstructured":"P. Whatmough , S. Lee , N. Mulholland , P. Hansen , S. Kodali , D. Brooks , and G. Wei . DNN ENGINE: A 16nm Sub-uJ Deep Neural Network Inference Accelerator for the Embedded Masses . In Hot Chips: A Symposium on High Performance Chips , August 2017 . P. Whatmough, S. Lee, N. Mulholland, P. Hansen, S. Kodali, D. Brooks, and G. Wei. DNN ENGINE: A 16nm Sub-uJ Deep Neural Network Inference Accelerator for the Embedded Masses. In Hot Chips: A Symposium on High Performance Chips, August 2017."},{"key":"e_1_3_2_1_97_1","volume-title":"Youngeun Kwon, Yujeong Choi, John Kim, and Minsoo Rhu In Proceedings of the International Symposium on Microarchitecture (MICRO)","author":"Zhang S.","year":"2016","unstructured":"S. Zhang , Z. Du , L. Zhang , H. Lan , S. Liu , L. Li , Q. Guo , T. Chen , and Y. Chen . Cambricon-X: An Accelerator for Sparse Neural Networks. Bongjoon Hyun , Youngeun Kwon, Yujeong Choi, John Kim, and Minsoo Rhu In Proceedings of the International Symposium on Microarchitecture (MICRO) , October 2016 . S. Zhang, Z. Du, L. Zhang, H. Lan, S. Liu, L. Li, Q. Guo, T. Chen, and Y. Chen. Cambricon-X: An Accelerator for Sparse Neural Networks. Bongjoon Hyun, Youngeun Kwon, Yujeong Choi, John Kim, and Minsoo Rhu In Proceedings of the International Symposium on Microarchitecture (MICRO), October 2016."},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446077"}],"event":{"name":"ASPLOS '20: Architectural Support for Programming Languages and Operating Systems","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"],"location":"Lausanne Switzerland","acronym":"ASPLOS '20"},"container-title":["Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3373376.3378494","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3373376.3378494","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:32:59Z","timestamp":1750199579000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3373376.3378494"}},"subtitle":["Architectural Support for Efficient Address Translations in Neural Processing Units"],"short-title":[],"issued":{"date-parts":[[2020,3,9]]},"references-count":98,"alternative-id":["10.1145\/3373376.3378494","10.1145\/3373376"],"URL":"https:\/\/doi.org\/10.1145\/3373376.3378494","relation":{},"subject":[],"published":{"date-parts":[[2020,3,9]]},"assertion":[{"value":"2020-03-13","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}