{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T14:46:49Z","timestamp":1775746009570,"version":"3.50.1"},"reference-count":45,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62174084"],"award-info":[{"award-number":["62174084"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB4400604"],"award-info":[{"award-number":["2022YFB4400604"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. VLSI Syst."],"published-print":{"date-parts":[[2023,11]]},"DOI":"10.1109\/tvlsi.2023.3305569","type":"journal-article","created":{"date-parts":[[2023,9,14]],"date-time":"2023-09-14T18:09:15Z","timestamp":1694714955000},"page":"1788-1801","source":"Crossref","is-referenced-by-count":18,"title":["An Efficient Training Accelerator for Transformers With Hardware-Algorithm Co-Optimization"],"prefix":"10.1109","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6965-3436","authenticated-orcid":false,"given":"Haikuo","family":"Shao","sequence":"first","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7134-6514","authenticated-orcid":false,"given":"Jinming","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9553-3640","authenticated-orcid":false,"given":"Meiqi","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Sun Yat-sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7227-4786","authenticated-orcid":false,"given":"Zhongfeng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","author":"radford","year":"2018","journal-title":"Improving language understanding by generative pre-training"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2021.3120113"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/OJSSCS.2021.3119554"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/VLSICircuits18222.2020.9162917"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00061"},{"key":"ref14","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"arXiv 2005 14165"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"ref31","first-page":"1","article-title":"Training DNNs with hybrid block floating point","author":"drumond","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref30","article-title":"ETA: An efficient training accelerator for DNNs based on hardware-algorithm co-optimization","author":"lu","year":"2022","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"ref11","article-title":"Music transformer","author":"huang","year":"2018","journal-title":"arXiv 1809 04281"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16462"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413535"},{"key":"ref32","article-title":"A study of BFLOAT16 for deep learning training","author":"kalamkar","year":"2019","journal-title":"arXiv 1905 12322"},{"key":"ref2","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume":"abs 1810","author":"devlin","year":"2019","journal-title":"ArXiv"},{"key":"ref1","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc NIPS"},{"key":"ref17","article-title":"A white paper on neural network quantization","author":"nagel","year":"2021","journal-title":"arXiv 2106 08295"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6313"},{"key":"ref16","first-page":"1","article-title":"Deep compression: Compressing deep neural network with pruning, trained quantization and Huffman coding","author":"han","year":"2015","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref38","article-title":"FlexBlock: A flexible DNN training accelerator with multi-mode block floating point support","author":"noh","year":"2022","journal-title":"arXiv 2203 06673"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00807"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00204"},{"key":"ref23","article-title":"Mixed precision training","author":"micikevicius","year":"2017","journal-title":"arXiv 1710 03740"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2019.8662302"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIC.2018.8502276"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2020.3021397"},{"key":"ref42","article-title":"Pointer sentinel mixture models","author":"merity","year":"2016","journal-title":"arXiv 1609 07843"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21236\/ADA273556"},{"key":"ref22","first-page":"1","article-title":"A 28 nm 27.5TOPS\/W approximate-computing-based transformer processor with asymptotic sparsity speculating and out-of-order computing","volume":"65","author":"wang","year":"2022","journal-title":"IEEE Int Solid-State Circuits Conf (ISSCC) Dig Tech Papers"},{"key":"ref44","author":"krizhevsky","year":"2009","journal-title":"Learning multiple layers of features from tiny images"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2023.3234893"},{"key":"ref43","article-title":"The Goldilocks principle: Reading children&#x2019;s books with explicit memory representations","author":"hill","year":"2015","journal-title":"arXiv 1511 02301"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3149787"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref8","first-page":"4055","article-title":"Image transformer","author":"parmar","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn (ICML)"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"ref4","article-title":"ALBERT: A lite BERT for self-supervised learning of language representations","author":"lan","year":"2019","journal-title":"arXiv 1909 11942"},{"key":"ref3","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"liu","year":"2019","journal-title":"arXiv 1907 11692"},{"key":"ref6","article-title":"An image is worth 16&#x00D7;16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"arXiv 2010 11929"},{"key":"ref5","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAIRE blog"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00016"}],"container-title":["IEEE Transactions on Very Large Scale Integration (VLSI) Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/92\/10287007\/10251161.pdf?arnumber=10251161","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,6]],"date-time":"2023-11-06T20:04:34Z","timestamp":1699301074000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10251161\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11]]},"references-count":45,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tvlsi.2023.3305569","relation":{},"ISSN":["1063-8210","1557-9999"],"issn-type":[{"value":"1063-8210","type":"print"},{"value":"1557-9999","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11]]}}}