{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T04:57:00Z","timestamp":1755838620652,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,9,25]],"date-time":"2017-09-25T00:00:00Z","timestamp":1506297600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,9,25]]},"DOI":"10.1145\/3127024.3127037","type":"proceedings-article","created":{"date-parts":[[2017,8,24]],"date-time":"2017-08-24T11:58:11Z","timestamp":1503575891000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["What does fault tolerant deep learning need from MPI?"],"prefix":"10.1145","author":[{"given":"Vinay","family":"Amatya","sequence":"first","affiliation":[{"name":"Pacific Northwest National Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abhinav","family":"Vishnu","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Charles","family":"Siegel","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeff","family":"Daily","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2017,9,25]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1097","volume-title":"Inc.","author":"Krizhevsky A.","year":"2012","unstructured":"A. Krizhevsky , I. Sutskever , and G. E. Hinton , \" Imagenet classification with deep convolutional neural networks,\" in Advances in Neural Information Processing Systems 25, F. Pereira, C. Burges, L. Bottou, and K. Weinberger, Eds. Curran Associates , Inc. , 2012 , pp. 1097 -- 1105 . {Online}. Available : http:\/\/papers.nips.cc\/paper\/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf A. Krizhevsky, I. Sutskever, and G. E. Hinton, \"Imagenet classification with deep convolutional neural networks,\" in Advances in Neural Information Processing Systems 25, F. Pereira, C. Burges, L. Bottou, and K. Weinberger, Eds. Curran Associates, Inc., 2012, pp. 1097--1105. {Online}. Available: http:\/\/papers.nips.cc\/paper\/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf"},{"key":"e_1_3_2_1_2_1","first-page":"2015","article-title":"Going deeper with convolutions","author":"Szegedy C.","year":"2015","unstructured":"C. Szegedy , W. Liu , Y. Jia , P. Sermanet , S. Reed , D. Anguelov , D. Erhan , V. Vanhoucke , and A. Rabinovich , \" Going deeper with convolutions ,\" in CVPR 2015 , 2015 . {Online}. Available: http:\/\/arxiv.org\/abs\/1409.4842 C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich, \"Going deeper with convolutions,\" in CVPR 2015, 2015. {Online}. Available: http:\/\/arxiv.org\/abs\/1409.4842","journal-title":"CVPR"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1038\/ncomms5308"},{"key":"e_1_3_2_1_4_1","volume-title":"Collins et al., \"Application of deep convolutional neural networks for detecting extreme weather in climate datasets,\" arXiv preprint arXiv.1605.01156","author":"Liu Y.","year":"2016","unstructured":"Y. Liu , E. Racah , J. Correa , A. Khosrowshahi , D. Lavers , K. Kunkel , M. Wehner , W. Collins et al., \"Application of deep convolutional neural networks for detecting extreme weather in climate datasets,\" arXiv preprint arXiv.1605.01156 , 2016 . Y. Liu, E. Racah, J. Correa, A. Khosrowshahi, D. Lavers, K. Kunkel, M. Wehner, W. Collins et al., \"Application of deep convolutional neural networks for detecting extreme weather in climate datasets,\" arXiv preprint arXiv.1605.01156, 2016."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1002\/jcc.24764"},{"key":"e_1_3_2_1_6_1","volume-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"Abadi M.","year":"2015","unstructured":"M. Abadi , A. Agarwal , P. Barham , E. Brevdo , Z. Chen , C. Citro , G. S. Corrado , A. Davis , J. Dean , M. Devin , S. Ghemawat , I. Goodfellow , A. Harp , G. Irving , M. Isard , Y. Jia , R. Jozefowicz , L. Kaiser , M. Kudlur , J. Levenberg , D. Man\u00e9 , R. Monga , S. Moore , D. Murray , C. Olah , M. Schuster , J. Shlens , B. Steiner , I. Sutskever , K. Talwar , P. Tucker , V. Vanhoucke , V. Vasudevan , F. Vi\u00e9gas , O. Vinyals , P. Warden , M. Wattenberg , M. Wicke , Y. Yu , and X. Zheng , \" TensorFlow: Large-scale machine learning on heterogeneous systems ,\" 2015 , software available from tensorflow.org. {Online}. Available: http:\/\/tensorflow.org\/ M. Abadi, A. Agarwal, P. Barham, E. Brevdo, Z. Chen, C. Citro, G. S. Corrado, A. Davis, J. Dean, M. Devin, S. Ghemawat, I. Goodfellow, A. Harp, G. Irving, M. Isard, Y. Jia, R. Jozefowicz, L. Kaiser, M. Kudlur, J. Levenberg, D. Man\u00e9, R. Monga, S. Moore, D. Murray, C. Olah, M. Schuster, J. Shlens, B. Steiner, I. Sutskever, K. Talwar, P. Tucker, V. Vanhoucke, V. Vasudevan, F. Vi\u00e9gas, O. Vinyals, P. Warden, M. Wattenberg, M. Wicke, Y. Yu, and X. Zheng, \"TensorFlow: Large-scale machine learning on heterogeneous systems,\" 2015, software available from tensorflow.org. {Online}. Available: http:\/\/tensorflow.org\/"},{"key":"e_1_3_2_1_7_1","volume-title":"Caffe: Convolutional architecture for fast feature embedding,\" arXiv preprint arXiv:1408.5093","author":"Jia Y.","year":"2014","unstructured":"Y. Jia , E. Shelhamer , J. Donahue , S. Karayev , J. Long , R. Girshick , S. Guadarrama , and T. Darrell , \" Caffe: Convolutional architecture for fast feature embedding,\" arXiv preprint arXiv:1408.5093 , 2014 . Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, and T. Darrell, \"Caffe: Convolutional architecture for fast feature embedding,\" arXiv preprint arXiv:1408.5093, 2014."},{"key":"e_1_3_2_1_8_1","volume-title":"Jun.","author":"Bergstra J.","year":"2010","unstructured":"J. Bergstra , O. Breuleux , F. Bastien , P. Lamblin , R. Pascanu , G. Desjardins , J. Turian , D. Warde-Farley , and Y. Bengio , \" Theano: a CPU and GPU math expression compiler,\" in Proceedings of the Python for Scientific Computing Conference (SciPy) , Jun. 2010 , oral Presentation . J. Bergstra, O. Breuleux, F. Bastien, P. Lamblin, R. Pascanu, G. Desjardins, J. Turian, D. Warde-Farley, and Y. Bengio, \"Theano: a CPU and GPU math expression compiler,\" in Proceedings of the Python for Scientific Computing Conference (SciPy), Jun. 2010, oral Presentation."},{"key":"e_1_3_2_1_9_1","volume-title":"Theano: new features and speed improvements,\" Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop","author":"Bastien F.","year":"2012","unstructured":"F. Bastien , P. Lamblin , R. Pascanu , J. Bergstra , I. J. Goodfellow , A. Bergeron , N. Bouchard , and Y. Bengio , \" Theano: new features and speed improvements,\" Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop , 2012 . F. Bastien, P. Lamblin, R. Pascanu, J. Bergstra, I. J. Goodfellow, A. Bergeron, N. Bouchard, and Y. Bengio, \"Theano: new features and speed improvements,\" Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop, 2012."},{"key":"e_1_3_2_1_10_1","volume-title":"Torch: A modular machine learning software library","author":"Collobert R.","year":"2002","unstructured":"R. Collobert , S. Bengio , and J. Marithoz , \" Torch: A modular machine learning software library ,\" 2002 . R. Collobert, S. Bengio, and J. Marithoz, \"Torch: A modular machine learning software library,\" 2002."},{"key":"e_1_3_2_1_11_1","first-page":"00175","article-title":"Firecaffe: near-linear acceleration of deep neural network training on compute clusters","volume":"1511","author":"Iandola F. N.","year":"2015","unstructured":"F. N. Iandola , K. Ashraf , M. W. Moskewicz , and K. Keutzer , \" Firecaffe: near-linear acceleration of deep neural network training on compute clusters ,\" CoRR , vol. abs\/ 1511 . 00175 , 2015 . {Online}. Available: http:\/\/arxiv.org\/abs\/1511.00175 F. N. Iandola, K. Ashraf, M. W. Moskewicz, and K. Keutzer, \"Firecaffe: near-linear acceleration of deep neural network training on compute clusters,\" CoRR, vol. abs\/1511.00175, 2015. {Online}. Available: http:\/\/arxiv.org\/abs\/1511.00175","journal-title":"CoRR"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1145\/3018743.3018769"},{"key":"e_1_3_2_1_13_1","volume-title":"Distributed tensorflow with mpi,\" arXiv preprint arXiv:1603.02339","author":"Vishnu A.","year":"2016","unstructured":"A. Vishnu , C. Siegel , and J. Daily , \" Distributed tensorflow with mpi,\" arXiv preprint arXiv:1603.02339 , 2016 . A. Vishnu, C. Siegel, and J. Daily, \"Distributed tensorflow with mpi,\" arXiv preprint arXiv:1603.02339, 2016."},{"key":"e_1_3_2_1_14_1","first-page":"06216","article-title":"Poseidon: A system architecture for efficient gpu-based deep learning on multiple machines","volume":"1512","author":"Zhang H.","year":"2015","unstructured":"H. Zhang , Z. Hu , J. Wei , P. Xie , G. Kim , Q. Ho , and E. P. Xing , \" Poseidon: A system architecture for efficient gpu-based deep learning on multiple machines ,\" CoRR , vol. abs\/ 1512 . 06216 , 2015 . {Online}. Available: http:\/\/arxiv.org\/abs\/1512.06216 H. Zhang, Z. Hu, J. Wei, P. Xie, G. Kim, Q. Ho, and E. P. Xing, \"Poseidon: A system architecture for efficient gpu-based deep learning on multiple machines,\" CoRR, vol. abs\/1512.06216, 2015. {Online}. Available: http:\/\/arxiv.org\/abs\/1512.06216","journal-title":"CoRR"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1145\/2783258.2783323"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1145\/2901318.2901323"},{"key":"e_1_3_2_1_17_1","first-page":"571","volume-title":"CO: USENIX Association","author":"Chilimbi T.","year":"2014","unstructured":"T. Chilimbi , Y. Suzue , J. Apacible , and K. Kalyanaraman , \" Project adam: Building an efficient and scalable deep learning training system,\" in 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). Broomfield , CO: USENIX Association , 2014 , pp. 571 -- 582 . {Online}. Available : https:\/\/www.usenix.org\/conference\/osdil4\/technical-sessions\/presentation\/chilimbi T. Chilimbi, Y. Suzue, J. Apacible, and K. Kalyanaraman, \"Project adam: Building an efficient and scalable deep learning training system,\" in 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). Broomfield, CO: USENIX Association, 2014, pp. 571--582. {Online}. Available: https:\/\/www.usenix.org\/conference\/osdil4\/technical-sessions\/presentation\/chilimbi"},{"key":"e_1_3_2_1_18_1","first-page":"01274","article-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems","volume":"1512","author":"Chen T.","year":"2015","unstructured":"T. Chen , M. Li , Y. Li , M. Lin , N. Wang , M. Wang , T. Xiao , B. Xu , C. Zhang , and Z. Zhang , \" Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems ,\" CoRR , vol. abs\/ 1512 . 01274 , 2015 . {Online}. Available: http:\/\/arxiv.org\/abs\/1512.01274 T. Chen, M. Li, Y. Li, M. Lin, N. Wang, M. Wang, T. Xiao, B. Xu, C. Zhang, and Z. Zhang, \"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems,\" CoRR, vol. abs\/1512.01274, 2015. {Online}. Available: http:\/\/arxiv.org\/abs\/1512.01274","journal-title":"CoRR"},{"unstructured":"\"Caffeonspark github project \" https:\/\/github.com\/yahoo\/CaffeOnSpark accessed: 2017-01-23.  \"Caffeonspark github project \" https:\/\/github.com\/yahoo\/CaffeOnSpark accessed: 2017-01-23.","key":"e_1_3_2_1_19_1"},{"key":"e_1_3_2_1_20_1","first-page":"06051","article-title":"Sparknet: Training deep networks in spark","volume":"1511","author":"Moritz P.","year":"2015","unstructured":"P. Moritz , R. Nishihara , I. Stoica , and M. I. Jordan , \" Sparknet: Training deep networks in spark ,\" CoRR , vol. abs\/ 1511 . 06051 , 2015 . {Online}. Available: http:\/\/arxiv.org\/abs\/1511.06051 P. Moritz, R. Nishihara, I. Stoica, and M. I. Jordan, \"Sparknet: Training deep networks in spark,\" CoRR, vol. abs\/1511.06051, 2015. {Online}. Available: http:\/\/arxiv.org\/abs\/1511.06051","journal-title":"CoRR"},{"key":"e_1_3_2_1_22_1","volume-title":"Jul.","author":"Chaudhari P.","year":"2017","unstructured":"P. Chaudhari , C. Baldassi , R. Zecchina , S. Soatto , and A. Talwalkar , \" Parle: parallelizing stochastic gradient descent,\" ArXiv e-prints , Jul. 2017 . P. Chaudhari, C. Baldassi, R. Zecchina, S. Soatto, and A. Talwalkar, \"Parle: parallelizing stochastic gradient descent,\" ArXiv e-prints, Jul. 2017."},{"key":"e_1_3_2_1_23_1","volume-title":"Large Minibatch SGD: Training ImageNet in 1 Hour,\" ArXiv e-prints","author":"Goyal P.","year":"2017","unstructured":"P. Goyal , P. Doll\u00e1r , R. Girshick , P. Noordhuis , L. Wesolowski , A. Kyrola , A. Tulloch , Y. Jia , and K. He , \" Accurate , Large Minibatch SGD: Training ImageNet in 1 Hour,\" ArXiv e-prints , Jun. 2017 . P. Goyal, P. Doll\u00e1r, R. Girshick, P. Noordhuis, L. Wesolowski, A. Kyrola, A. Tulloch, Y. Jia, and K. He, \"Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour,\" ArXiv e-prints, Jun. 2017."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1109\/TDSC.2009.4"},{"unstructured":"\"Caffe2 is a lightweight modular and scalable deep learning framework.\" https:\/\/github.com\/caffe2\/caffe2 accessed: 2017-05-25.  \"Caffe2 is a lightweight modular and scalable deep learning framework.\" https:\/\/github.com\/caffe2\/caffe2 accessed: 2017-05-25.","key":"e_1_3_2_1_25_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_26_1","DOI":"10.1177\/1094342015623623"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.1007\/s00607-013-0331-3"},{"key":"e_1_3_2_1_28_1","volume-title":"Machine learning: an algorithmic perspective","author":"Marsland S.","year":"2015","unstructured":"S. Marsland , Machine learning: an algorithmic perspective . CRC press , 2015 . S. Marsland, Machine learning: an algorithmic perspective. CRC press, 2015."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1177\/1094342004046045"},{"key":"e_1_3_2_1_30_1","first-page":"1232","volume-title":"Eds., 2012","author":"Dean J.","year":"2012","unstructured":"J. Dean , G. Corrado , R. Monga , K. Chen , M. Devin , M. Mao , M. Ranzato , A. Senior , P. Tucker , K. Yang , Q. V. Le , and A. Y. Ng , \" Large scale distributed deep networks,\" in Advances in Neural Information Processing Systems 25, P. Bartlett, F. Pereira, C. Burges, L. Bottou, and K. Weinberger , Eds., 2012 , pp. 1232 -- 1240 . {Online}. Available : http:\/\/books.nips.cc\/papers\/files\/nips25\/NIPS 2012 _0598.pdf J. Dean, G. Corrado, R. Monga, K. Chen, M. Devin, M. Mao, M. Ranzato, A. Senior, P. Tucker, K. Yang, Q. V. Le, and A. Y. Ng, \"Large scale distributed deep networks,\" in Advances in Neural Information Processing Systems 25, P. Bartlett, F. Pereira, C. Burges, L. Bottou, and K. Weinberger, Eds., 2012, pp. 1232--1240. {Online}. Available: http:\/\/books.nips.cc\/papers\/files\/nips25\/NIPS2012_0598.pdf"},{"key":"e_1_3_2_1_31_1","first-page":"06709","article-title":"Distributed deep learning using synchronous stochastic gradient descent","volume":"1602","author":"Das D.","year":"2016","unstructured":"D. Das , S. Avancha , D. Mudigere , K. Vaidyanathan , S. Sridharan , D. D. Kalamkar , B. Kaul , and P. Dubey , \" Distributed deep learning using synchronous stochastic gradient descent ,\" CoRR , vol. abs\/ 1602 . 06709 , 2016 . {Online}. Available: http:\/\/arxiv.org\/abs\/1602.06709 D. Das, S. Avancha, D. Mudigere, K. Vaidyanathan, S. Sridharan, D. D. Kalamkar, B. Kaul, and P. Dubey, \"Distributed deep learning using synchronous stochastic gradient descent,\" CoRR, vol. abs\/1602.06709, 2016. {Online}. Available: http:\/\/arxiv.org\/abs\/1602.06709","journal-title":"CoRR"},{"key":"e_1_3_2_1_32_1","first-page":"2737","volume-title":"NIPS'15","author":"Lian X.","year":"2015","unstructured":"X. Lian , Y. Huang , Y. Li , and J. Liu , \" Asynchronous parallel stochastic gradient for nonconvex optimization,\" in Proceedings of the 28th International Conference on Neural Information Processing Systems, ser . NIPS'15 . Cambridge, MA, USA: MIT Press , 2015 , pp. 2737 -- 2745 . {Online}. Available: http:\/\/dl.acm.org\/citation.cfm?id=2969442.2969545 X. Lian, Y. Huang, Y. Li, and J. Liu, \"Asynchronous parallel stochastic gradient for nonconvex optimization,\" in Proceedings of the 28th International Conference on Neural Information Processing Systems, ser. NIPS'15. Cambridge, MA, USA: MIT Press, 2015, pp. 2737--2745. {Online}. Available: http:\/\/dl.acm.org\/citation.cfm?id=2969442.2969545"},{"key":"e_1_3_2_1_33_1","first-page":"1097","volume-title":"NIPS'12","author":"Krizhevsky A.","year":"2012","unstructured":"A. Krizhevsky , I. Sutskever , and G. E. Hinton , \" Imagenet classification with deep convolutional neural networks,\" in Proceedings of the 25th International Conference on Neural Information Processing Systems, ser . NIPS'12 . USA: Curran Associates Inc. , 2012 , pp. 1097 -- 1105 . {Online}. Available: http:\/\/dl.acm.org\/citation.cfm?id=2999134.2999257 A. Krizhevsky, I. Sutskever, and G. E. Hinton, \"Imagenet classification with deep convolutional neural networks,\" in Proceedings of the 25th International Conference on Neural Information Processing Systems, ser. NIPS'12. USA: Curran Associates Inc., 2012, pp. 1097--1105. {Online}. Available: http:\/\/dl.acm.org\/citation.cfm?id=2999134.2999257"},{"key":"e_1_3_2_1_34_1","first-page":"963","volume-title":"ICPADS 2016","author":"Zheng S.","year":"2016","unstructured":"S. Zheng , A. Vishnu , and C. H. Q. Ding , \"Accelerating deep learning with shrinkage and recall,\" in 22nd IEEE International Conference on Parallel and Distributed Systems , ICPADS 2016 , Wuhan, China , December 13-16, 2016 , 2016, pp. 963 -- 970 . {Online}. Available S. Zheng, A. Vishnu, and C. H. Q. Ding, \"Accelerating deep learning with shrinkage and recall,\" in 22nd IEEE International Conference on Parallel and Distributed Systems, ICPADS 2016, Wuhan, China, December 13-16, 2016, 2016, pp. 963--970. {Online}. Available"},{"key":"e_1_3_2_1_35_1","first-page":"753","volume-title":"USA","author":"Siegel C.","year":"2016","unstructured":"C. Siegel , J. Daily , and A. Vishnu , \" Adaptive neuron apoptosis for accelerating deep learning on large scale systems,\" in 2016 IEEE International Conference on Big Data, BigData 2016, Washington DC , USA , December 5-8, 2016 , 2016, pp. 753 -- 762 . {Online}. Available C. Siegel, J. Daily, and A. Vishnu, \"Adaptive neuron apoptosis for accelerating deep learning on large scale systems,\" in 2016 IEEE International Conference on Big Data, BigData 2016, Washington DC, USA, December 5-8, 2016, 2016, pp. 753--762. {Online}. Available"},{"key":"e_1_3_2_1_36_1","volume-title":"Adaptive neuron apoptosis for accelerating deep learning on large scale systems,\" arXiv preprint arXiv.1610.00790","author":"Siegel C.","year":"2016","unstructured":"C. Siegel , J. Daily , and A. Vishnu , \" Adaptive neuron apoptosis for accelerating deep learning on large scale systems,\" arXiv preprint arXiv.1610.00790 , 2016 . C. Siegel, J. Daily, and A. Vishnu, \"Adaptive neuron apoptosis for accelerating deep learning on large scale systems,\" arXiv preprint arXiv.1610.00790, 2016."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_37_1","DOI":"10.1109\/CLUSTER.2015.30"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_38_1","DOI":"10.1109\/CLUSTER.2015.26"},{"key":"e_1_3_2_1_39_1","first-page":"222","volume-title":"IPDPS 2016","author":"Vishnu A.","year":"2016","unstructured":"A. Vishnu , H. van Dam , N. R. Tallent , D. J. Kerbyson , and A. Hoisie , \" Fault modeling of extreme scale applications using machine learning,\" in 2016 IEEE International Parallel and Distributed Processing Symposium , IPDPS 2016 , Chicago, IL, USA , May 23-27, 2016 , 2016, pp. 222 -- 231 . {Online}. Available A. Vishnu, H. van Dam, N. R. Tallent, D. J. Kerbyson, and A. Hoisie, \"Fault modeling of extreme scale applications using machine learning,\" in 2016 IEEE International Parallel and Distributed Processing Symposium, IPDPS 2016, Chicago, IL, USA, May 23-27, 2016, 2016, pp. 222--231. {Online}. Available"},{"key":"e_1_3_2_1_40_1","first-page":"598","volume-title":"ICPP 2016","author":"Shohdy S.","year":"2016","unstructured":"S. Shohdy , A. Vishnu , and G. Agrawal , \" Fault tolerant support vector machines,\" in 45th International Conference on Parallel Processing , ICPP 2016 , Philadelphia, PA, USA , August 16-19, 2016 , 2016, pp. 598 -- 607 . {Online}. Available S. Shohdy, A. Vishnu, and G. Agrawal, \"Fault tolerant support vector machines,\" in 45th International Conference on Parallel Processing, ICPP 2016, Philadelphia, PA, USA, August 16-19, 2016, 2016, pp. 598--607. {Online}. Available"},{"key":"e_1_3_2_1_41_1","first-page":"05116","article-title":"Fault tolerant frequent pattern mining","volume":"1610","author":"Shohdy S.","year":"2016","unstructured":"S. Shohdy , A. Vishnu , and G. Agrawal , \" Fault tolerant frequent pattern mining ,\" CoRR , vol. abs\/ 1610 . 05116 , 2016 . {Online}. Available: http:\/\/arxiv.org\/abs\/1610.05116 S. Shohdy, A. Vishnu, and G. Agrawal, \"Fault tolerant frequent pattern mining,\" CoRR, vol. abs\/1610.05116, 2016. {Online}. Available: http:\/\/arxiv.org\/abs\/1610.05116","journal-title":"CoRR"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.1145\/2834892.2834893"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_44_1","DOI":"10.5555\/3018874.3018877"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1109\/IPDPS.2013.115"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_46_1","DOI":"10.1145\/125826.125925"},{"key":"e_1_3_2_1_47_1","volume-title":"forked from BVLC\/caffe,\" https:\/\/github.com\/intel\/caffe","author":"Intel Corporation","year":"2016","unstructured":"Intel Corporation , \"intel\/caffe , forked from BVLC\/caffe,\" https:\/\/github.com\/intel\/caffe , 2016 . Intel Corporation, \"intel\/caffe, forked from BVLC\/caffe,\" https:\/\/github.com\/intel\/caffe, 2016."},{"key":"e_1_3_2_1_48_1","volume-title":"The mnist database of handwritten digits","author":"LeCun Y.","year":"1998","unstructured":"Y. LeCun , C. Cortes , and C. J. Burges , \" The mnist database of handwritten digits , 1998 ,\" Available electronically at http:\/\/yann.lecun.com\/exdb\/mnist, 2012. Y. LeCun, C. Cortes, and C. J. Burges, \"The mnist database of handwritten digits, 1998,\" Available electronically at http:\/\/yann.lecun.com\/exdb\/mnist, 2012."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_49_1","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_50_1","volume-title":"Tech. Rep.","author":"Krizhevsky A.","year":"2009","unstructured":"A. Krizhevsky , \"Learning multiple layers of features from tiny images,\" University of Toronto , Tech. Rep. , 2009 . A. Krizhevsky, \"Learning multiple layers of features from tiny images,\" University of Toronto, Tech. Rep., 2009."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_51_1","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_52_1","first-page":"1","volume-title":"March 2007","author":"Hursey J.","unstructured":"J. Hursey , J. M. Squyres , T. I. Mattox , and A. Lumsdaine , \" The design and implementation of checkpoint\/restart process fault tolerance for open mpi,\" in 2007 IEEE International Parallel and Distributed Processing Symposium , March 2007 , pp. 1 -- 8 . J. Hursey, J. M. Squyres, T. I. Mattox, and A. Lumsdaine, \"The design and implementation of checkpoint\/restart process fault tolerance for open mpi,\" in 2007 IEEE International Parallel and Distributed Processing Symposium, March 2007, pp. 1--8."},{"key":"e_1_3_2_1_53_1","first-page":"1","volume-title":"Aug 2009","author":"Ma C.","unstructured":"C. Ma , Z. Huo , J. Cai , and D. Meng , \" Dcr: A fully transparent checkpoint\/restart framework for distributed systems,\" in 2009 IEEE International Conference on Cluster Computing and Workshops , Aug 2009 , pp. 1 -- 10 . C. Ma, Z. Huo, J. Cai, and D. Meng, \"Dcr: A fully transparent checkpoint\/restart framework for distributed systems,\" in 2009 IEEE International Conference on Cluster Computing and Workshops, Aug 2009, pp. 1--10."},{"key":"e_1_3_2_1_54_1","first-page":"346","volume-title":"Aug 2016","author":"Gamell M.","unstructured":"M. Gamell , D. S. Katz , K. Teranishi , M. A. Heroux , R. F. V. der Wijngaart , T. G. Mattson , and M. Parashar , \" Evaluating online global recovery with fenix using application-aware in-memory checkpointing techniques,\" in 2016 45th International Conference on Parallel Processing Workshops (ICPPW) , Aug 2016 , pp. 346 -- 355 . M. Gamell, D. S. Katz, K. Teranishi, M. A. Heroux, R. F. V. der Wijngaart, T. G. Mattson, and M. Parashar, \"Evaluating online global recovery with fenix using application-aware in-memory checkpointing techniques,\" in 2016 45th International Conference on Parallel Processing Workshops (ICPPW), Aug 2016, pp. 346--355."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_55_1","DOI":"10.1109\/ICPP.2011.85"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_56_1","DOI":"10.1145\/2287076.2287099"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_57_1","DOI":"10.1109\/TPDS.2008.172"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_58_1","DOI":"10.1109\/ICPADS.2010.48"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_59_1","DOI":"10.1145\/2807591.2807617"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_60_1","DOI":"10.1007\/11945918_47"}],"event":{"sponsor":["Mellanox Mellanox Technologies","Intel Intel","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"],"acronym":"EuroMPI\/USA '17","name":"EuroMPI\/USA '17: 24th European MPI Users' Group Meeting","location":"Chicago Illinois"},"container-title":["Proceedings of the 24th European MPI Users' Group Meeting"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3127024.3127037","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3127024.3127037","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:11:06Z","timestamp":1750212666000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3127024.3127037"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,9,25]]},"references-count":58,"alternative-id":["10.1145\/3127024.3127037","10.1145\/3127024"],"URL":"https:\/\/doi.org\/10.1145\/3127024.3127037","relation":{},"subject":[],"published":{"date-parts":[[2017,9,25]]},"assertion":[{"value":"2017-09-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}