{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,6,24]],"date-time":"2024-06-24T23:15:08Z","timestamp":1719270908476},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,11,17]],"date-time":"2019-11-17T00:00:00Z","timestamp":1573948800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,11,17]]},"DOI":"10.1145\/3295500.3356137","type":"proceedings-article","created":{"date-parts":[[2019,11,7]],"date-time":"2019-11-07T19:43:22Z","timestamp":1573155802000},"update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":36,"title":["Large-batch training for LSTM and beyond"],"prefix":"10.1145","author":[{"given":"Yang","family":"You","sequence":"first","affiliation":[{"name":"UC Berkeley"}]},{"given":"Jonathan","family":"Hseu","sequence":"additional","affiliation":[{"name":"Google Brain"}]},{"given":"Chris","family":"Ying","sequence":"additional","affiliation":[{"name":"Google Brain"}]},{"given":"James","family":"Demmel","sequence":"additional","affiliation":[{"name":"UC Berkeley"}]},{"given":"Kurt","family":"Keutzer","sequence":"additional","affiliation":[{"name":"UC Berkeley"}]},{"given":"Cho-Jui","family":"Hsieh","sequence":"additional","affiliation":[{"name":"UCLA"}]}],"member":"320","published-online":{"date-parts":[[2019,11,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes. arXiv preprint arXiv:1711.04325","author":"Akiba Takuya","year":"2017","unstructured":"Takuya Akiba , Shuji Suzuki , and Keisuke Fukuda . 2017. Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes. arXiv preprint arXiv:1711.04325 ( 2017 ). Takuya Akiba, Shuji Suzuki, and Keisuke Fukuda. 2017. Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes. arXiv preprint arXiv:1711.04325 (2017)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Yoshua Bengio Patrice Simard Paolo Frasconi etal 1994. Learning long-term dependencies with gradient descent is difficult. IEEE transactions on neural networks 5 2 (1994) 157--166. Yoshua Bengio Patrice Simard Paolo Frasconi et al. 1994. Learning long-term dependencies with gradient descent is difficult. IEEE transactions on neural networks 5 2 (1994) 157--166.","DOI":"10.1109\/72.279181"},{"key":"e_1_3_2_1_3_1","volume-title":"Optimization methods for large-scale machine learning. arXiv preprint arXiv:1606.04838","author":"Bottou L\u00e9on","year":"2016","unstructured":"L\u00e9on Bottou , Frank E Curtis , and Jorge Nocedal . 2016. Optimization methods for large-scale machine learning. arXiv preprint arXiv:1606.04838 ( 2016 ). L\u00e9on Bottou, Frank E Curtis, and Jorge Nocedal. 2016. Optimization methods for large-scale machine learning. arXiv preprint arXiv:1606.04838 (2016)."},{"key":"e_1_3_2_1_4_1","volume-title":"Revisiting distributed synchronous SGD. arXiv preprint arXiv:1604.00981","author":"Chen Jianmin","year":"2016","unstructured":"Jianmin Chen , Rajat Monga , Samy Bengio , and Rafal Jozefowicz . 2016. Revisiting distributed synchronous SGD. arXiv preprint arXiv:1604.00981 ( 2016 ). Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. 2016. Revisiting distributed synchronous SGD. arXiv preprint arXiv:1604.00981 (2016)."},{"key":"e_1_3_2_1_5_1","volume-title":"Scale out for large minibatch SGD: Residual network training on ImageNet-1K with improved accuracy and reduced time to train. arXiv preprint arXiv:1711.04291","author":"Codreanu Valeriu","year":"2017","unstructured":"Valeriu Codreanu , Damian Podareanu , and Vikram Saletore . 2017. Scale out for large minibatch SGD: Residual network training on ImageNet-1K with improved accuracy and reduced time to train. arXiv preprint arXiv:1711.04291 ( 2017 ). Valeriu Codreanu, Damian Podareanu, and Vikram Saletore. 2017. Scale out for large minibatch SGD: Residual network training on ImageNet-1K with improved accuracy and reduced time to train. arXiv preprint arXiv:1711.04291 (2017)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"James Demmel. 2013. Communication-Avoiding Algorithms for Linear Algebra and Beyond.. In IPDPS. 585. James Demmel. 2013. Communication-Avoiding Algorithms for Linear Algebra and Beyond.. In IPDPS. 585.","DOI":"10.1109\/IPDPS.2013.123"},{"key":"e_1_3_2_1_7_1","volume-title":"AdaBatch: Adaptive Batch Sizes for Training Deep Neural Networks. arXiv preprint arXiv:1712.02029","author":"Devarakonda Aditya","year":"2017","unstructured":"Aditya Devarakonda , Maxim Naumov , and Michael Garland . 2017. AdaBatch: Adaptive Batch Sizes for Training Deep Neural Networks. arXiv preprint arXiv:1712.02029 ( 2017 ). Aditya Devarakonda, Maxim Naumov, and Michael Garland. 2017. AdaBatch: Adaptive Batch Sizes for Training Deep Neural Networks. arXiv preprint arXiv:1712.02029 (2017)."},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2018 . Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018). Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","author":"Duchi John","year":"2011","unstructured":"John Duchi , Elad Hazan , and Yoram Singer . 2011 . Adaptive subgradient methods for online learning and stochastic optimization . Journal of Machine Learning Research 12 , Jul (2011), 2121 -- 2159 . John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization. Journal of Machine Learning Research 12, Jul (2011), 2121--2159.","journal-title":"Journal of Machine Learning Research 12"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF01584320"},{"key":"e_1_3_2_1_12_1","volume-title":"Deep learning","author":"Goodfellow Ian","unstructured":"Ian Goodfellow , Yoshua Bengio , Aaron Courville , and Yoshua Bengio . 2016. Deep learning . Vol. 1 . MIT press Cambridge . Ian Goodfellow, Yoshua Bengio, Aaron Courville, and Yoshua Bengio. 2016. Deep learning. Vol. 1. MIT press Cambridge."},{"key":"e_1_3_2_1_13_1","volume-title":"Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal , Piotr Doll\u00e1r , Ross Girshick , Pieter Noordhuis , Lukasz Wesolowski , Aapo Kyrola , Andrew Tulloch , Yangqing Jia , and Kaiming He. 2017. Accurate , Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:1706.02677 ( 2017 ). Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","volume-title":"Long short-term memory. Neural computation 9, 8","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber . 1997. Long short-term memory. Neural computation 9, 8 ( 1997 ), 1735--1780. Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_16_1","volume-title":"Train longer, generalize better: closing the generalization gap in large batch training of neural networks. arXiv preprint arXiv:1705.08741","author":"Hoffer Elad","year":"2017","unstructured":"Elad Hoffer , Itay Hubara , and Daniel Soudry . 2017. Train longer, generalize better: closing the generalization gap in large batch training of neural networks. arXiv preprint arXiv:1705.08741 ( 2017 ). Elad Hoffer, Itay Hubara, and Daniel Soudry. 2017. Train longer, generalize better: closing the generalization gap in large batch training of neural networks. arXiv preprint arXiv:1705.08741 (2017)."},{"key":"e_1_3_2_1_17_1","volume-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861","author":"Howard Andrew G","year":"2017","unstructured":"Andrew G Howard , Menglong Zhu , Bo Chen , Dmitry Kalenichenko , Weijun Wang , Tobias Weyand , Marco Andreetto , and Hartwig Adam . 2017 . Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861 (2017). Andrew G Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. 2017. Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861 (2017)."},{"key":"e_1_3_2_1_18_1","volume-title":"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <","author":"Iandola Forrest N","year":"2016","unstructured":"Forrest N Iandola , Song Han , Matthew W Moskewicz , Khalid Ashraf , William J Dally , and Kurt Keutzer . 2016. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and < ; 0.5 MB model size. arXiv preprint arXiv:1602.07360 ( 2016 ). Forrest N Iandola, Song Han, Matthew W Moskewicz, Khalid Ashraf, William J Dally, and Kurt Keutzer. 2016. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and < 0.5 MB model size. arXiv preprint arXiv:1602.07360 (2016)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.284"},{"key":"e_1_3_2_1_20_1","unstructured":"Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu etal 2018. Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes. arXiv preprint arXiv:1807.11205 (2018). Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu et al. 2018. Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes. arXiv preprint arXiv:1807.11205 (2018)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_22_1","volume-title":"On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836","author":"Keskar Nitish Shirish","year":"2016","unstructured":"Nitish Shirish Keskar , Dheevatsa Mudigere , Jorge Nocedal , Mikhail Smelyanskiy , and Ping Tak Peter Tang . 2016. On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836 ( 2016 ). Nitish Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, and Ping Tak Peter Tang. 2016. On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836 (2016)."},{"key":"e_1_3_2_1_23_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014). Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_24_1","volume-title":"One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997","author":"Krizhevsky Alex","year":"2014","unstructured":"Alex Krizhevsky . 2014. One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997 ( 2014 ). Alex Krizhevsky. 2014. One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997 (2014)."},{"key":"e_1_3_2_1_25_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105. Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2623330.2623612"},{"key":"e_1_3_2_1_29_1","volume-title":"Feature pyramid networks for object detection. arXiv preprint arXiv:1612.03144","author":"Lin Tsung-Yi","year":"2016","unstructured":"Tsung-Yi Lin , Piotr Doll\u00e1r , Ross Girshick , Kaiming He , Bharath Hariharan , and Serge Belongie . 2016. Feature pyramid networks for object detection. arXiv preprint arXiv:1612.03144 ( 2016 ). Tsung-Yi Lin, Piotr Doll\u00e1r, Ross Girshick, Kaiming He, Bharath Hariharan, and Serge Belongie. 2016. Feature pyramid networks for object detection. arXiv preprint arXiv:1612.03144 (2016)."},{"key":"e_1_3_2_1_30_1","volume-title":"Neural Machine Translation (seq2seq) Tutorial. https:\/\/github.com\/tensorflow\/nmt","author":"Luong Minh-Thang","year":"2017","unstructured":"Minh-Thang Luong , Eugene Brevdo , and Rui Zhao . 2017. Neural Machine Translation (seq2seq) Tutorial. https:\/\/github.com\/tensorflow\/nmt ( 2017 ). Minh-Thang Luong, Eugene Brevdo, and Rui Zhao. 2017. Neural Machine Translation (seq2seq) Tutorial. https:\/\/github.com\/tensorflow\/nmt (2017)."},{"key":"e_1_3_2_1_31_1","volume-title":"Mary Ann Marcinkiewicz, and Beatrice Santorini","author":"Marcus Mitchell P","year":"1993","unstructured":"Mitchell P Marcus , Mary Ann Marcinkiewicz, and Beatrice Santorini . 1993 . Building a large annotated corpus of English: The Penn Treebank. Computational linguistics 19, 2 (1993), 313--330. Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. 1993. Building a large annotated corpus of English: The Penn Treebank. Computational linguistics 19, 2 (1993), 313--330."},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. 2408--2417","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse . 2015 . Optimizing neural networks with kronecker-factored approximate curvature . In International conference on machine learning. 2408--2417 . James Martens and Roger Grosse. 2015. Optimizing neural networks with kronecker-factored approximate curvature. In International conference on machine learning. 2408--2417."},{"key":"e_1_3_2_1_33_1","unstructured":"Paulius Micikevicius Sharan Narang Jonah Alben Gregory Diamos Erich Elsen David Garcia Boris Ginsburg Michael Houston Oleksii Kuchaev Ganesh Venkatesh etal 2017. Mixed precision training. arXiv preprint arXiv:1710.03740 (2017). Paulius Micikevicius Sharan Narang Jonah Alben Gregory Diamos Erich Elsen David Garcia Boris Ginsburg Michael Houston Oleksii Kuchaev Ganesh Venkatesh et al. 2017. Mixed precision training. arXiv preprint arXiv:1710.03740 (2017)."},{"key":"e_1_3_2_1_34_1","unstructured":"Daniel Neil Michael Pfeiffer and Shih-Chii Liu. 2016. Phased lstm: Accelerating recurrent network training for long or event-based sequences. In Advances in Neural Information Processing Systems. 3882--3890. Daniel Neil Michael Pfeiffer and Shih-Chii Liu. 2016. Phased lstm: Accelerating recurrent network training for long or event-based sequences. In Advances in Neural Information Processing Systems. 3882--3890."},{"key":"e_1_3_2_1_35_1","volume-title":"Second-order Optimization Method for Large Mini-batch: Training ResNet-50 on ImageNet in 35 Epochs. arXiv preprint arXiv:1811.12019","author":"Osawa Kazuki","year":"2018","unstructured":"Kazuki Osawa , Yohei Tsuji , Yuichiro Ueno , Akira Naruse , Rio Yokota , and Satoshi Matsuoka . 2018. Second-order Optimization Method for Large Mini-batch: Training ResNet-50 on ImageNet in 35 Epochs. arXiv preprint arXiv:1811.12019 ( 2018 ). Kazuki Osawa, Yohei Tsuji, Yuichiro Ueno, Akira Naruse, Rio Yokota, and Satoshi Matsuoka. 2018. Second-order Optimization Method for Large Mini-batch: Training ResNet-50 on ImageNet in 35 Epochs. arXiv preprint arXiv:1811.12019 (2018)."},{"key":"e_1_3_2_1_36_1","volume-title":"On the momentum term in gradient descent learning algorithms. Neural networks 12, 1","author":"Qian Ning","year":"1999","unstructured":"Ning Qian . 1999. On the momentum term in gradient descent learning algorithms. Neural networks 12, 1 ( 1999 ), 145--151. Ning Qian. 1999. On the momentum term in gradient descent learning algorithms. Neural networks 12, 1 (1999), 145--151."},{"key":"e_1_3_2_1_37_1","volume-title":"Herbert Robbins Selected Papers","author":"Robbins Herbert","unstructured":"Herbert Robbins and Sutton Monro . 1985. A stochastic approximation method . In Herbert Robbins Selected Papers . Springer , 102--109. Herbert Robbins and Sutton Monro. 1985. A stochastic approximation method. In Herbert Robbins Selected Papers. Springer, 102--109."},{"key":"e_1_3_2_1_38_1","volume-title":"Mesh-tensorflow: Deep learning for supercomputers. In Advances in Neural Information Processing Systems. 10435--10444.","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer , Youlong Cheng , Niki Parmar , Dustin Tran , Ashish Vaswani , Penporn Koanantakool , Peter Hawkins , HyoukJoong Lee , Mingsheng Hong , Cliff Young , 2018 . Mesh-tensorflow: Deep learning for supercomputers. In Advances in Neural Information Processing Systems. 10435--10444. Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, et al. 2018. Mesh-tensorflow: Deep learning for supercomputers. In Advances in Neural Information Processing Systems. 10435--10444."},{"key":"e_1_3_2_1_39_1","volume-title":"Increase the Batch Size. arXiv preprint arXiv:1711.00489","author":"Smith Samuel L","year":"2017","unstructured":"Samuel L Smith , Pieter-Jan Kindermans , and Quoc V Le. 2017. Don't Decay the Learning Rate , Increase the Batch Size. arXiv preprint arXiv:1711.00489 ( 2017 ). Samuel L Smith, Pieter-Jan Kindermans, and Quoc V Le. 2017. Don't Decay the Learning Rate, Increase the Batch Size. arXiv preprint arXiv:1711.00489 (2017)."},{"key":"e_1_3_2_1_40_1","volume-title":"International conference on machine learning. 1139--1147","author":"Sutskever Ilya","year":"2013","unstructured":"Ilya Sutskever , James Martens , George Dahl , and Geoffrey Hinton . 2013 . On the importance of initialization and momentum in deep learning . In International conference on machine learning. 1139--1147 . Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton. 2013. On the importance of initialization and momentum in deep learning. In International conference on machine learning. 1139--1147."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298188"},{"key":"e_1_3_2_1_42_1","volume-title":"Mnasnet: Platform-aware neural architecture search for mobile. arXiv preprint arXiv:1807.11626","author":"Tan Mingxing","year":"2018","unstructured":"Mingxing Tan , Bo Chen , Ruoming Pang , Vijay Vasudevan , and Quoc V Le . 2018 . Mnasnet: Platform-aware neural architecture search for mobile. arXiv preprint arXiv:1807.11626 (2018). Mingxing Tan, Bo Chen, Ruoming Pang, Vijay Vasudevan, and Quoc V Le. 2018. Mnasnet: Platform-aware neural architecture search for mobile. arXiv preprint arXiv:1807.11626 (2018)."},{"key":"e_1_3_2_1_43_1","volume-title":"Divide the gradient by a running average of its recent magnitude. COURSERA: Neural networks for machine learning 4, 2","author":"Tieleman Tijmen","year":"2012","unstructured":"Tijmen Tieleman and Geoffrey Hinton . 2012. Lecture 6.5-rmsprop : Divide the gradient by a running average of its recent magnitude. COURSERA: Neural networks for machine learning 4, 2 ( 2012 ), 26--31. Tijmen Tieleman and Geoffrey Hinton. 2012. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude. COURSERA: Neural networks for machine learning 4, 2 (2012), 26--31."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1093"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.2172\/1407078"},{"key":"e_1_3_2_1_46_1","unstructured":"Yonghui Wu Mike Schuster Zhifeng Chen Quoc V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey etal 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016). Yonghui Wu Mike Schuster Zhifeng Chen Quoc V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016)."},{"key":"e_1_3_2_1_47_1","volume-title":"Image Classification at Supercomputer Scale. arXiv preprint arXiv:1811.06992","author":"Ying Chris","year":"2018","unstructured":"Chris Ying , Sameer Kumar , Dehao Chen , Tao Wang , and Youlong Cheng . 2018. Image Classification at Supercomputer Scale. arXiv preprint arXiv:1811.06992 ( 2018 ). Chris Ying, Sameer Kumar, Dehao Chen, Tao Wang, and Youlong Cheng. 2018. Image Classification at Supercomputer Scale. arXiv preprint arXiv:1811.06992 (2018)."},{"key":"e_1_3_2_1_48_1","volume-title":"Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888","author":"You Yang","year":"2017","unstructured":"Yang You , Igor Gitman , and Boris Ginsburg . 2017. Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888 ( 2017 ). Yang You, Igor Gitman, and Boris Ginsburg. 2017. Scaling sgd batch size to 32k for imagenet training. arXiv preprint arXiv:1708.03888 (2017)."},{"key":"e_1_3_2_1_49_1","volume-title":"ImageNet training in minutes. CoRR, abs\/1709.05011","author":"You Yang","year":"2017","unstructured":"Yang You , Zhao Zhang , C Hsieh , James Demmel , and Kurt Keutzer . 2017. ImageNet training in minutes. CoRR, abs\/1709.05011 ( 2017 ). Yang You, Zhao Zhang, C Hsieh, James Demmel, and Kurt Keutzer. 2017. ImageNet training in minutes. CoRR, abs\/1709.05011 (2017)."},{"key":"e_1_3_2_1_50_1","volume-title":"Multi-scale context aggregation by dilated convolutions. arXiv preprint arXiv:1511.07122","author":"Yu Fisher","year":"2015","unstructured":"Fisher Yu and Vladlen Koltun . 2015. Multi-scale context aggregation by dilated convolutions. arXiv preprint arXiv:1511.07122 ( 2015 ). Fisher Yu and Vladlen Koltun. 2015. Multi-scale context aggregation by dilated convolutions. arXiv preprint arXiv:1511.07122 (2015)."},{"key":"e_1_3_2_1_51_1","volume-title":"ADADELTA: an adaptive learning rate method. arXiv preprint arXiv:1212.5701","author":"Zeiler Matthew D","year":"2012","unstructured":"Matthew D Zeiler . 2012. ADADELTA: an adaptive learning rate method. arXiv preprint arXiv:1212.5701 ( 2012 ). Matthew D Zeiler. 2012. ADADELTA: an adaptive learning rate method. arXiv preprint arXiv:1212.5701 (2012)."}],"event":{"name":"SC '19: The International Conference for High Performance Computing, Networking, Storage, and Analysis","location":"Denver Colorado","acronym":"SC '19","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3295500.3356137","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,8]],"date-time":"2023-01-08T18:57:16Z","timestamp":1673204236000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3295500.3356137"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,11,17]]},"references-count":50,"alternative-id":["10.1145\/3295500.3356137","10.1145\/3295500"],"URL":"http:\/\/dx.doi.org\/10.1145\/3295500.3356137","relation":{},"subject":[],"published":{"date-parts":[[2019,11,17]]},"assertion":[{"value":"2019-11-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}