{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T10:05:06Z","timestamp":1764842706028},"reference-count":98,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,2]]},"DOI":"10.1109\/hpca56546.2023.10070941","type":"proceedings-article","created":{"date-parts":[[2023,3,24]],"date-time":"2023-03-24T17:42:55Z","timestamp":1679679775000},"page":"1003-1016","source":"Crossref","is-referenced-by-count":4,"title":["High Performance and Power Efficient Accelerator for Cloud Inference"],"prefix":"10.1109","author":[{"given":"Jianguo","family":"Yao","sequence":"first","affiliation":[{"name":"SJTU and Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Hao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Yalin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Ying","family":"Li","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Chuang","family":"Feng","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Shi","family":"Chen","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Jiaoyan","family":"Chen","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Yongdong","family":"Wang","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]},{"given":"Qiaojuan","family":"Hu","sequence":"additional","affiliation":[{"name":"Enflame-Tech Inc.,Shanghai,China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Bert models from google research"},{"key":"ref2","article-title":"CenterNet models"},{"key":"ref3","article-title":"Cloud computing services - amazon web services (aws)"},{"key":"ref4","article-title":"Cloud computing services - microsoft azure"},{"key":"ref5","article-title":"Conformer models"},{"key":"ref6","article-title":"Cuda programming guide"},{"key":"ref7","article-title":"Nvidia A10 GPU"},{"key":"ref8","article-title":"Nvidia T4 GPU"},{"key":"ref9","article-title":"Open neural network exchange"},{"key":"ref10","article-title":"Permutation matrix"},{"key":"ref11","article-title":"Pytorch image classification models"},{"key":"ref12","article-title":"Retinaface models"},{"key":"ref13","article-title":"SRResnet models"},{"key":"ref14","article-title":"Tensorflow models"},{"key":"ref15","article-title":"Tensorrt command-line wrapper: trtexec"},{"key":"ref16","article-title":"Unet models"},{"key":"ref17","article-title":"Yolo v3 models"},{"key":"ref18","first-page":"265283","article-title":"Tensorflow: A system for large-scale machine learning","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation, ser. OSDI\u201916","author":"Abadi"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.47"},{"article-title":"First-generation inference accelerator deployment at facebook","year":"2021","author":"Anderson","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304026"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3390\/en10101470"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2019.2935065"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/MDAT.2022.3161126"},{"key":"ref25","first-page":"579594","article-title":"Tvm: An automated end-to-end optimizing compiler for deep learning","volume-title":"ser. OSDI\u201918","author":"Chen","year":"2018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.022071131"},{"article-title":"RetinaFace: Single-stage dense face localisation in the wild","year":"2019","author":"Deng","key":"ref27"},{"article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00667"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2019.2908101"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/1067651.801649"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIC.2018.8502276"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00012"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155675"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2582924"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/mm.2012.71"},{"key":"ref38","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2020-3015","article-title":"Conformer: Convolution-augmented transformer for speech recognition","author":"Gulati","year":"2020"},{"volume-title":"Goya Inference Platform White Paper","year":"2020","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3282307"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00023"},{"article-title":"Delta keyword transformer: Bringing transformers to the edge through dynamically pruned multi-head selfattention","year":"2022","author":"Jelicov\u00e1","key":"ref45"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220619"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3154484"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2008.4658633"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/349299.349320"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.19"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00071"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567224"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2020.2979336"},{"key":"ref60","first-page":"881","article-title":"Rammer: Enabling holistic deep learning compiler optimizations with rtasks","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma"},{"article-title":"Accelerating sparse deep neural networks","year":"2021","author":"Mishra","key":"ref61"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3058217"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/1133981.1133997"},{"volume-title":"NVIDIA T4 70W Low Profile PCIe GPU Accelerator (PB09256-001_v05)","year":"2020","key":"ref66"},{"volume-title":"NVIDIA A10 GPU Accelerator (PB-10415-001_v04)","year":"2022","key":"ref67"},{"article-title":"Deep learning inference in facebook data centers: Characterization, performance optimizations and hardware implications","year":"2018","author":"Park","key":"ref68"},{"article-title":"Pytorch: An imperative style, highperformance deep learning library","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Paszke","key":"ref69"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507738"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.23919\/DATE51398.2021.9474225"},{"article-title":"YOLOv3: An incremental improvement","year":"2018","author":"Redmon","key":"ref72"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/tc.2016.2620469"},{"article-title":"Infaas: A model-less and managed inference serving system","year":"2019","author":"Romero","key":"ref74"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"article-title":"Glow: Graph lowering compiler techniques for neural networks","year":"2018","author":"Rotem","key":"ref76"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.16"},{"article-title":"Very deep convolutional networks for large-scale image recognition","year":"2014","author":"Simonyan","key":"ref78"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/2980930.2907957"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1016\/B978-012088469-8.50058-9"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00012"},{"article-title":"Tensor comprehensions: Framework-agnostic highperformance machine learning abstractions","year":"2018","author":"Vasilache","key":"ref84"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/VLSID.2016.24"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00074"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3243905"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875671"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370330"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-32049-6_5"},{"key":"ref93","first-page":"13711385","volume-title":"HSM: A Hybrid Slowdown Model for Multitasking GPUs","author":"Zhao","year":"2020"},{"key":"ref94","first-page":"863","article-title":"Ansor: Generating High-Performance tensor programs for deep learning","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zheng"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1145\/2886101"},{"key":"ref98","first-page":"59","article-title":"Exploiting mixed simd parallelism by reducing data reorganization overhead","volume-title":"2016 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","author":"Zhou"}],"event":{"name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","start":{"date-parts":[[2023,2,25]]},"location":"Montreal, QC, Canada","end":{"date-parts":[[2023,3,1]]}},"container-title":["2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10070856\/10070923\/10070941.pdf?arnumber=10070941","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T13:31:19Z","timestamp":1707831079000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10070941\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2]]},"references-count":98,"URL":"https:\/\/doi.org\/10.1109\/hpca56546.2023.10070941","relation":{},"subject":[],"published":{"date-parts":[[2023,2]]}}}