{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:13:53Z","timestamp":1740100433707,"version":"3.37.3"},"reference-count":45,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,11,1]],"date-time":"2021-11-01T00:00:00Z","timestamp":1635724800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,11,1]],"date-time":"2021-11-01T00:00:00Z","timestamp":1635724800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea (NRF)","doi-asserted-by":"publisher","award":["NRF-2020R1A2B5B02002690"],"award-info":[{"award-number":["NRF-2020R1A2B5B02002690"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,1]]},"DOI":"10.1109\/iccad51958.2021.9643433","type":"proceedings-article","created":{"date-parts":[[2021,12,23]],"date-time":"2021-12-23T23:06:46Z","timestamp":1640300806000},"page":"1-9","source":"Crossref","is-referenced-by-count":1,"title":["Deferred Dropout: An Algorithm-Hardware Co-Design DNN Training Method Provisioning Consistent High Activation Sparsity"],"prefix":"10.1109","author":[{"given":"Kangkyu","family":"Park","sequence":"first","affiliation":[{"name":"KAIST,School of Electrical Engineering"}]},{"given":"Yunki","family":"Han","sequence":"additional","affiliation":[{"name":"KAIST,School of Electrical Engineering"}]},{"given":"Lee-Sup","family":"Kim","sequence":"additional","affiliation":[{"name":"KAIST,School of Electrical Engineering"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.1109\/CVPR42600.2020.00225"},{"key":"ref38","article-title":"Backprop with approximate activations for memory-efficient network training","volume":"32","author":"chakrabarti","year":"2019","journal-title":"Advances in neural information processing systems"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref32","article-title":"Large scale distributed deep networks","volume":"25","author":"dean","year":"2012","journal-title":"Advances in neural information processing systems"},{"year":"2016","author":"chen","journal-title":"Training Deep Nets with Sublinear Memory Cost","key":"ref31"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1145\/3307650.3322263"},{"key":"ref37","first-page":"15625","article-title":"Sparse weight activation training","volume":"33","author":"raihan","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref36","article-title":"Dynamic sparse graph for efficient deep learning","author":"liu","year":"2019","journal-title":"International Conference on Learning Representations"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1145\/3352460.3358269"},{"key":"ref34","first-page":"3299","article-title":"meProp: Sparsified back propagation for accelerated deep learning with reduced overfitting","volume":"70","author":"sun","year":"2017","journal-title":"Proceedings of the 34th International Conference on Machine Learning ser Proceedings of Machine Learning Research"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/MICRO.2016.7783721"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.1109\/ICASSP.2019.8682791"},{"key":"ref11","article-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization","author":"jain","year":"2020","journal-title":"MLSys"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1109\/ISCA.2018.00070"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/HPCA.2018.00017"},{"year":"2015","author":"xu","journal-title":"Empirical evaluation of rectified activations in convolutional network","key":"ref14"},{"key":"ref15","first-page":"4171","article-title":"BERT: Pretraining of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies Volume 1 (Long and Short Papers)"},{"year":"2020","author":"hendrycks","journal-title":"Gaussian error linear units (gelus)","key":"ref16"},{"key":"ref17","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"J Mach Learn Res"},{"year":"2019","author":"gomez","journal-title":"Learning sparse networks using targeted dropout","key":"ref18"},{"key":"ref19","first-page":"6000","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems ser NIPS&#x2019; 17"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/DAC18072.2020.9218710"},{"key":"ref4","article-title":"Big self-supervised models are strong semi-supervised learners","author":"chen","year":"2020","journal-title":"Advances in neural information processing systems"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.1109\/ISCA45697.2020.00075"},{"year":"2020","author":"seff","journal-title":"Sketchgraphs A large-scale dataset for modeling relational geometry in computer-aided design","key":"ref3"},{"year":"2020","author":"gupta","journal-title":"Chasing carbon The elusive environmental footprint of computing","key":"ref6"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1109\/DAC18072.2020.9218507"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1145\/3360307"},{"key":"ref8","first-page":"1","article-title":"Measuring the effects of data parallelism on neural network training","volume":"20","author":"shallue","year":"2019","journal-title":"Journal of Machine Learning Research"},{"year":"2018","author":"devarakonda","journal-title":"AdaBatch Adaptive batch sizes for training deep neural networks","key":"ref7"},{"key":"ref2","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"year":"2018","author":"goyal","journal-title":"Accurate large minibatch sgd Training imagenet in 1 hour","key":"ref9"},{"key":"ref1","article-title":"An image is worth 16&#x00D7;16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"International Conference on Learning Representations"},{"year":"2020","journal-title":"google-research\/bert","key":"ref20"},{"doi-asserted-by":"publisher","key":"ref45","DOI":"10.1145\/3079856.3080246"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/MDAT.2017.2741463"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1145\/3007787.3001138"},{"key":"ref42","first-page":"1135","article-title":"Learning both weights and connections for efficient neural networks","volume":"1","author":"han","year":"2015","journal-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems"},{"year":"2017","author":"ramachandran","journal-title":"Searching for Activation Function","key":"ref24"},{"key":"ref41","first-page":"8024","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Advances in Neural IInformation Processing Systems"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1145\/3079856.3080254"},{"year":"0","journal-title":"PyTorch","key":"ref44"},{"year":"0","journal-title":"Papers with code - gelu explained","key":"ref26"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.1145\/3297858.3304028"},{"year":"2020","author":"misra","journal-title":"Mish A Self Regularized Non-Monotonic Activation Function","key":"ref25"}],"event":{"name":"2021 IEEE\/ACM International Conference On Computer Aided Design (ICCAD)","start":{"date-parts":[[2021,11,1]]},"location":"Munich, Germany","end":{"date-parts":[[2021,11,4]]}},"container-title":["2021 IEEE\/ACM International Conference On Computer Aided Design (ICCAD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9643423\/9643432\/09643433.pdf?arnumber=9643433","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,3]],"date-time":"2022-08-03T00:12:08Z","timestamp":1659485528000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9643433\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,1]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/iccad51958.2021.9643433","relation":{},"subject":[],"published":{"date-parts":[[2021,11,1]]}}}