{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T13:51:55Z","timestamp":1750859515144,"version":"3.28.0"},"reference-count":50,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,9]]},"DOI":"10.1109\/cahpc.2018.8645935","type":"proceedings-article","created":{"date-parts":[[2019,2,21]],"date-time":"2019-02-21T23:19:26Z","timestamp":1550791166000},"page":"290-297","source":"Crossref","is-referenced-by-count":11,"title":["Large Scale Language Modeling: Converging on 40GB of Text in Four Hours"],"prefix":"10.1109","author":[{"given":"Raul","family":"Puri","sequence":"first","affiliation":[]},{"given":"Robert","family":"Kirby","sequence":"additional","affiliation":[]},{"given":"Nikolai","family":"Yakovenko","sequence":"additional","affiliation":[]},{"given":"Bryan","family":"Catanzaro","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","article-title":"A bayesian perspective on generalization and stochastic gradient descent","volume":"abs 1710 6451","author":"smith","year":"2017","journal-title":"CoRR"},{"key":"ref38","article-title":"Visualizing and understanding recurrent networks","volume":"abs 1506 2078","author":"karpathy","year":"2015","journal-title":"CoRR"},{"journal-title":"English gigaword fifth edition","year":"2011","author":"parker","key":"ref33"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref31","article-title":"Unsupervised pretraining for sequence to sequence learning","volume":"abs 1611 2683","author":"ramachandran","year":"2016","journal-title":"CoRR"},{"key":"ref30","article-title":"Skip-thought vectors","volume":"abs 1506 6726","author":"kiros","year":"2015","journal-title":"CoRR"},{"key":"ref37","article-title":"On the difficulty of training recurrent neural networks","volume":"abs 1211 5063","author":"pascanu","year":"2012","journal-title":"CoRR"},{"key":"ref36","first-page":"9","author":"lecun","year":"2012","journal-title":"Efficient backprop"},{"journal-title":"Gpu kernels for block-sparse weights","year":"2017","author":"gray","key":"ref35"},{"key":"ref34","article-title":"One billion word benchmark for measuring progress in statistical language modeling","volume":"abs 1312 3005","author":"chelba","year":"2013","journal-title":"CoRR"},{"key":"ref28","article-title":"Universal sentence encoder","volume":"abs 1803 11175","author":"cer","year":"2018","journal-title":"CoRR"},{"key":"ref27","article-title":"Learning general purpose distributed sentence representations via large scale multi-task learning","volume":"abs 1804 79","author":"subramanian","year":"2018","journal-title":"CoRR"},{"key":"ref29","article-title":"Generating wikipedia by summarizing long sequences","author":"liu","year":"2018","journal-title":"ICLRE"},{"journal-title":"Do Better ImageNet Models Transfer Better?","year":"2018","author":"kornblith","key":"ref2"},{"key":"ref1","article-title":"Extremely large minibatch SGD: training resnet-50 on imagenet in 15 minutes","volume":"abs 1711 4325","author":"akiba","year":"2017","journal-title":"CoRR"},{"journal-title":"Scaling neural machine translation","year":"2018","author":"ott","key":"ref20"},{"key":"ref22","article-title":"Semi-supervised sequence learning","volume":"abs 1511 1432","author":"dai","year":"2015","journal-title":"CoRR"},{"key":"ref21","article-title":"Deep learning scaling is predictable, empirically","volume":"abs 1712 409","author":"hestness","year":"2017","journal-title":"CoRR"},{"key":"ref24","article-title":"Learned in translation: Contextualized word vectors","volume":"abs 1708 107","author":"mccann","year":"2017","journal-title":"CoRR"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K16-1006"},{"key":"ref26","article-title":"Fine-tuned language models for text classification","volume":"abs 1801 6146","author":"howard","year":"2018","journal-title":"CoRR"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"ref50","article-title":"Training deep nets with sublinear memory cost","volume":"abs 1604 6174","author":"chen","year":"2016","journal-title":"CoRR"},{"key":"ref10","article-title":"Decaf: A deep convolutional activation feature for generic visual recognition","author":"donahue","year":"2013","journal-title":"International Conference on Machine Learning"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2014.131"},{"key":"ref40","article-title":"Don't decay the learning rate, increase the batch size","volume":"abs 1711 489","author":"smith","year":"2017","journal-title":"CoRR"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"journal-title":"Improving language understanding by generative pre-training","year":"2018","author":"radford","key":"ref13"},{"key":"ref14","first-page":"384","article-title":"Word representations: A simple and general method for semi-supervised learning","author":"turian","year":"2010","journal-title":"Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics Ser ACL '10"},{"key":"ref15","article-title":"Distributed representations of words and phrases and their compositionality","volume":"abs 1310 4546","author":"mikolov","year":"2013","journal-title":"CoRR"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref17","article-title":"Accurate, large minibatch SGD: training imagenet in 1 hour","volume":"abs 1706 2677","author":"goyal","year":"2017","journal-title":"CoRR"},{"key":"ref18","article-title":"100-epoch imagenet training with alexnet in 24 minutes","volume":"abs 1709 5011","author":"you","year":"2017","journal-title":"CoRR"},{"key":"ref19","article-title":"Scaling SGD batch size to 32k for imagenet training","volume":"abs 1708 3888","author":"you","year":"2017","journal-title":"CoRR"},{"key":"ref4","article-title":"Multiplicative LSTM for sequence modelling","volume":"abs 1609 7959","author":"krause","year":"2016","journal-title":"CoRR"},{"key":"ref3","article-title":"Learning to generate reviews and discovering sentiment","volume":"abs 1704 1444","author":"radford","year":"2017","journal-title":"CoRR"},{"key":"ref6","article-title":"Very deep convolutional networks for large-scale image recognition","volume":"abs 1409 1556","author":"simonyan","year":"2014","journal-title":"CoRR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/2766462.2767755"},{"journal-title":"ImageNet A Large-scale Hierarchical Image Database","year":"2009","author":"deng","key":"ref8"},{"key":"ref7","article-title":"Deep residual learning for image recognition","volume":"abs 1512 3385","author":"he","year":"2015","journal-title":"CoRR"},{"key":"ref49","article-title":"Attention is all you need","volume":"abs 1706 3762","author":"vaswani","year":"2017","journal-title":"CoRR"},{"key":"ref9","article-title":"Imagenet large scale visual recognition challenge","volume":"abs 1409 575","author":"russakovsky","year":"2014","journal-title":"CoRR"},{"key":"ref46","article-title":"Weight normalization: A simple repa-rameterization to accelerate training of deep neural networks","volume":"abs 1602 7868","author":"salimans","year":"2016","journal-title":"CoRR"},{"journal-title":"Training Recurrent Neural Networks","year":"2013","author":"sutskever","key":"ref45"},{"key":"ref48","first-page":"2825","article-title":"Scikit-learn: Machine learning in Python","volume":"12","author":"pedregosa","year":"2011","journal-title":"Journal of Machine Learning Research"},{"journal-title":"Adam A method for stochastic optimization","year":"2014","author":"kingma","key":"ref47"},{"key":"ref42","article-title":"Mixed precision training","volume":"abs 1710 3740","author":"micikevicius","year":"2017","journal-title":"CoRR"},{"journal-title":"Train longer generalize better closing the generalization gap in large batch training of neural networks","year":"2017","author":"hoffer","key":"ref41"},{"journal-title":"On Automatic Differentiation","year":"2017","author":"paszke","key":"ref44"},{"journal-title":"Mixed precision training Choosing a scaling factor","year":"2018","key":"ref43"}],"event":{"name":"2018 30th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)","start":{"date-parts":[[2018,9,24]]},"location":"Lyon, France","end":{"date-parts":[[2018,9,27]]}},"container-title":["2018 30th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8638685\/8645847\/08645935.pdf?arnumber=8645935","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T08:02:47Z","timestamp":1643270567000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8645935\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,9]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/cahpc.2018.8645935","relation":{},"subject":[],"published":{"date-parts":[[2018,9]]}}}