{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T16:39:51Z","timestamp":1771259991609,"version":"3.50.1"},"reference-count":53,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,1]]},"DOI":"10.1109\/aspdac.2018.8297377","type":"proceedings-article","created":{"date-parts":[[2018,2,22]],"date-time":"2018-02-22T17:02:02Z","timestamp":1519318922000},"page":"527-533","source":"Crossref","is-referenced-by-count":1,"title":["Accelerator-centric deep learning systems for enhanced scalability, energy-efficiency, and programmability"],"prefix":"10.1109","author":[{"given":"Minsoo","family":"Rhu","sequence":"first","affiliation":[]}],"member":"263","reference":[{"key":"ref39","article-title":"Deep learning with COTS HPC systems","author":"coates","year":"2013","journal-title":"International Conference on Machine Learning"},{"key":"ref38","author":"zhang","year":"2017","journal-title":"Poseidon An Efficient Communication Architecture for Distributed Deep Learning on GPU Clusters"},{"key":"ref33","author":"cray","year":"2017","journal-title":"Cray CS-Storm Accelerated Cluster Supercomputers"},{"key":"ref32","year":"2016","journal-title":"History of Massive-scale Sorting Experiments at Google"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001163"},{"key":"ref37","year":"2017","journal-title":"NVIDIA DGX-l System Architecture The Fastest Platform for Deep Learning"},{"key":"ref36","author":"smith","year":"2014","journal-title":"NVIDIA Volta IBM Power9 Land Contracts For New US Government Supercomputers"},{"key":"ref35","year":"2016","journal-title":"NVLINK High-Speed Interconnect"},{"key":"ref34","year":"2016","journal-title":"NVIDIA DGX-1 Deep Learning System"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001138"},{"key":"ref27","article-title":"Learning Both Weights and Connections for Efficient Neural Network","author":"han","year":"2015","journal-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS)"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783723"},{"key":"ref2","author":"collobert","year":"2011","journal-title":"Natural Language Processing (almost) from Scratch"},{"key":"ref1","article-title":"ImageNet Classification with Deep Convolutional Neural Networks","author":"krizhevsky","year":"2012","journal-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS)"},{"key":"ref20","year":"2008","journal-title":"CUBLAS Library"},{"key":"ref22","article-title":"XNOR-Net: Im-ageNet Classification using Binary Convolutional Neural Networks","author":"rastegari","year":"2016","journal-title":"European Conference on Computer Vision"},{"key":"ref21","year":"2016","journal-title":"Nvidia cudnn - gpu accelerated deep learning"},{"key":"ref24","author":"micikevicius","year":"2017","journal-title":"Mixed Precision Training"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2017.7966166"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"ref25","author":"vasilache","year":"2014","journal-title":"Fast convolutional nets with fbfft A GPU performance evaluation"},{"key":"ref50","year":"2017","journal-title":"Chainer Out-of-core training"},{"key":"ref51","year":"2016","journal-title":"Theano A Python framework for fast computation of mathematical expressions"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830794"},{"key":"ref52","author":"rhu","year":"2017","journal-title":"Com-pressing DMA Engine Leveraging Activation Sparsity for Training Deep Neural Networks"},{"key":"ref10","year":"2016","journal-title":"Microsoft"},{"key":"ref11","article-title":"MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems","author":"chen","year":"2015","journal-title":"Workshop on Machine Learning Systems"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.284"},{"key":"ref12","article-title":"Large Scale Distributed Deep Networks","author":"dean","year":"2012","journal-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS)"},{"key":"ref13","article-title":"Project Adam: Building an Efficient and Scalable Deep Learning Training System","author":"chilimbi","year":"2014","journal-title":"OSDI"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref15","author":"simonyan","year":"2015","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"ref18","year":"2013","journal-title":"High-bandwidth Memory (HBM) DRAM"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIT.2012.6242474"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"ref3","article-title":"Learning hand-eye coordination for robotic grasping with deep learning and large-scale data collection","author":"levine","year":"2016","journal-title":"The International Journal of Robotics Research"},{"key":"ref6","author":"jouppi","year":"2017","journal-title":"In-datacenter performance analysis of a tensor processing unit"},{"key":"ref5","year":"2017","journal-title":"NVIDIA Tesla V100"},{"key":"ref8","year":"2016","journal-title":"TENSOR"},{"key":"ref7","year":"2017","journal-title":"Build and Train Machine Learning Models on our New Google Cloud TPUs"},{"key":"ref49","year":"2017","journal-title":"Tensorflow Memory-optimizer"},{"key":"ref9","year":"2016","journal-title":"Caffe"},{"key":"ref46","year":"2013","journal-title":"Unified Memory in CUDA 6"},{"key":"ref45","year":"2016","journal-title":"Torch"},{"key":"ref48","author":"krause","year":"2006","journal-title":"Address Translation Services"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446077"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/2966884.2966912"},{"key":"ref41","year":"2016","journal-title":"NVIDIA Collective Communications Library (NCCL)"},{"key":"ref44","author":"awan","year":"2017","journal-title":"Optimized Broadcast for Deep Learning Workloads on Dense-GPU InfiniBand Clusters MPI or NCCL?"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2017.25"}],"event":{"name":"2018 23rd Asia and South Pacific Design Automation Conference (ASP-DAC)","location":"Jeju","start":{"date-parts":[[2018,1,22]]},"end":{"date-parts":[[2018,1,25]]}},"container-title":["2018 23rd Asia and South Pacific Design Automation Conference (ASP-DAC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8291862\/8297256\/08297377.pdf?arnumber=8297377","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2018,4,11]],"date-time":"2018-04-11T17:11:09Z","timestamp":1523466669000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/8297377\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,1]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/aspdac.2018.8297377","relation":{},"subject":[],"published":{"date-parts":[[2018,1]]}}}