{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:15:32Z","timestamp":1780355732748,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,12,9]],"date-time":"2019-12-09T00:00:00Z","timestamp":1575849600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,12,9]]},"DOI":"10.1145\/3361525.3361538","type":"proceedings-article","created":{"date-parts":[[2019,11,22]],"date-time":"2019-11-22T18:41:59Z","timestamp":1574448119000},"page":"82-95","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["FfDL"],"prefix":"10.1145","author":[{"given":"K. R.","family":"Jayaram","sequence":"first","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vinod","family":"Muthusamy","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Parijat","family":"Dube","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vatche","family":"Ishakian","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chen","family":"Wang","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Benjamin","family":"Herta","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Scott","family":"Boag","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Diana","family":"Arroyo","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Asser","family":"Tantawi","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Archit","family":"Verma","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Falk","family":"Pollok","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rania","family":"Khalaf","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY and Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2019,12,9]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Vinay Amatya Abhinav Vishnu Charles Siegel and Jeff Daily. 2017. What does fault tolerant Deep Learning need from MPI?. In EuroMPI\/USA.  Vinay Amatya Abhinav Vishnu Charles Siegel and Jeff Daily. 2017. What does fault tolerant Deep Learning need from MPI?. In EuroMPI\/USA.","DOI":"10.1145\/3127024.3127037"},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon Web Services. 2017. Amazon Sagemaker. https:\/\/aws.amazon.com\/sagemaker\/.  Amazon Web Services. 2017. Amazon Sagemaker. https:\/\/aws.amazon.com\/sagemaker\/."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098021"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jnca.2015.10.004"},{"key":"e_1_3_2_1_5_1","unstructured":"CoreOS Inc. 2018. The ETCD Key Value Store. https:\/\/coreos.com\/etcd\/.  CoreOS Inc. 2018. The ETCD Key Value Store. https:\/\/coreos.com\/etcd\/."},{"key":"e_1_3_2_1_6_1","volume-title":"Docker: Enterprise Application Container Platform. https:\/\/www.docker.com\/.","year":"2019","unstructured":"Docker. 2019 . Docker: Enterprise Application Container Platform. https:\/\/www.docker.com\/. Docker. 2019. Docker: Enterprise Application Container Platform. https:\/\/www.docker.com\/."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342010391989"},{"key":"e_1_3_2_1_8_1","unstructured":"Jeffrey Dunn. 2016. Introducing FBLearner Flow: Facebook's AI backbone. https:\/\/code.fb.com\/core-data\/introducing-fblearner-flow-facebook-s-ai-backbone\/.  Jeffrey Dunn. 2016. Introducing FBLearner Flow: Facebook's AI backbone. https:\/\/code.fb.com\/core-data\/introducing-fblearner-flow-facebook-s-ai-backbone\/."},{"key":"e_1_3_2_1_9_1","article-title":"A survey of fault tolerance mechanisms and checkpoint\/restart implementations for high performance computing systems","volume":"65","author":"Egwutuoha Ifeanyi P.","year":"2013","unstructured":"Ifeanyi P. Egwutuoha , David Levy , Bran Selic , and Shiping Chen . 2013 . A survey of fault tolerance mechanisms and checkpoint\/restart implementations for high performance computing systems . The Journal of Super computing 65 , 3 (01 Sep 2013), 1302--1326. https:\/\/doi.org\/10.1007\/s11227-013-0884-0 10.1007\/s11227-013-0884-0 Ifeanyi P. Egwutuoha, David Levy, Bran Selic, and Shiping Chen. 2013. A survey of fault tolerance mechanisms and checkpoint\/restart implementations for high performance computing systems. The Journal of Super computing 65, 3 (01 Sep 2013), 1302--1326. https:\/\/doi.org\/10.1007\/s11227-013-0884-0","journal-title":"The Journal of Super computing"},{"key":"e_1_3_2_1_10_1","unstructured":"Apache Software Foundation. 2019. Apache Mesos. http:\/\/mesos.apache.org.  Apache Software Foundation. 2019. Apache Mesos. http:\/\/mesos.apache.org."},{"key":"e_1_3_2_1_11_1","unstructured":"James Fox Yiming Zou and Judy Qiu. 2016. Software Frameworks for Deep Learning at Scale.  James Fox Yiming Zou and Judy Qiu. 2016. Software Frameworks for Deep Learning at Scale."},{"key":"e_1_3_2_1_12_1","volume-title":"Kubernetes: Production Grade Container Orchestration.","year":"2019","unstructured":"Google. 2019 . Kubernetes: Production Grade Container Orchestration. Google. 2019. Kubernetes: Production Grade Container Orchestration."},{"key":"e_1_3_2_1_13_1","unstructured":"Google Inc. 2018. Google Cloud Machine Learning Engine. https:\/\/cloud.google.com\/ml-engine\/.  Google Inc. 2018. Google Cloud Machine Learning Engine. https:\/\/cloud.google.com\/ml-engine\/."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.compind.2018.03.027"},{"key":"e_1_3_2_1_16_1","volume-title":"Meet Michelangelo: Uber's Machine Learning Platform. https:\/\/eng.uber.com\/michelangelo\/.","author":"Hermann Jeremy","year":"2017","unstructured":"Jeremy Hermann and Mike Del Baso . 2017 . Meet Michelangelo: Uber's Machine Learning Platform. https:\/\/eng.uber.com\/michelangelo\/. Jeremy Hermann and Mike Del Baso. 2017. Meet Michelangelo: Uber's Machine Learning Platform. https:\/\/eng.uber.com\/michelangelo\/."},{"key":"e_1_3_2_1_17_1","unstructured":"IBM Corporation. 2018. IBM Watson Machine Learning. https:\/\/developer.ibm.com\/clouddataservices\/docs\/ibm-watson-machine-learning\/.  IBM Corporation. 2018. IBM Watson Machine Learning. https:\/\/developer.ibm.com\/clouddataservices\/docs\/ibm-watson-machine-learning\/."},{"key":"e_1_3_2_1_18_1","unstructured":"IBM Inc. 2018. The IBM Cloud. https:\/\/www.ibm.com\/cloud\/bare-metal-servers.  IBM Inc. 2018. The IBM Cloud. https:\/\/www.ibm.com\/cloud\/bare-metal-servers."},{"key":"e_1_3_2_1_19_1","volume-title":"Tensorflow: An open-source machine learning framework for everyone. https:\/\/www.tensorflow.org\/.","author":"Google Inc.","year":"2018","unstructured":"Google Inc. 2018 . Tensorflow: An open-source machine learning framework for everyone. https:\/\/www.tensorflow.org\/. Google Inc. 2018. Tensorflow: An open-source machine learning framework for everyone. https:\/\/www.tensorflow.org\/."},{"key":"e_1_3_2_1_20_1","unstructured":"Google Inc. 2018. TensorFlow CNN Benchmarks. https:\/\/www.nytimes.com\/2018\/04\/06\/opinion\/sunday\/germs-microbes-processed-foods.html.  Google Inc. 2018. TensorFlow CNN Benchmarks. https:\/\/www.nytimes.com\/2018\/04\/06\/opinion\/sunday\/germs-microbes-processed-foods.html."},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA Inc. 2018. NVIDIA DGX-1: Essential Instrument Of AI Research. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-1\/.  NVIDIA Inc. 2018. NVIDIA DGX-1: Essential Instrument Of AI Research. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-1\/."},{"key":"e_1_3_2_1_22_1","volume-title":"Caffe: Convolutional Architecture for Fast Feature Embedding. arXiv preprint arXiv:1408.5093","author":"Jia Yangqing","year":"2014","unstructured":"Yangqing Jia , Evan Shelhamer , Jeff Donahue , Sergey Karayev , Jonathan Long , Ross Girshick , Sergio Guadarrama , and Trevor Darrell . 2014 . Caffe: Convolutional Architecture for Fast Feature Embedding. arXiv preprint arXiv:1408.5093 (2014). Yangqing Jia, Evan Shelhamer, Jeff Donahue, Sergey Karayev, Jonathan Long, Ross Girshick, Sergio Guadarrama, and Trevor Darrell. 2014. Caffe: Convolutional Architecture for Fast Feature Embedding. arXiv preprint arXiv:1408.5093 (2014)."},{"key":"e_1_3_2_1_23_1","unstructured":"Justin C. Johnson. 2018. CNN Benchmarks. https:\/\/github.com\/jcjohnson\/cnn-benchmarks.  Justin C. Johnson. 2018. CNN Benchmarks. https:\/\/github.com\/jcjohnson\/cnn-benchmarks."},{"key":"e_1_3_2_1_24_1","volume-title":"Xiangyu Zhang and Jian Sun","author":"Kaiming He Shaoqing Ren","year":"2015","unstructured":"Shaoqing Ren Kaiming He , Xiangyu Zhang and Jian Sun . 2015 . Deep Residual Networks . https:\/\/github.com\/KaimingHe\/deep-residual-networks. Shaoqing Ren Kaiming He, Xiangyu Zhang and Jian Sun. 2015. Deep Residual Networks. https:\/\/github.com\/KaimingHe\/deep-residual-networks."},{"key":"e_1_3_2_1_25_1","volume-title":"Fault-Tolerant Systems","author":"Koren Israel","unstructured":"Israel Koren and C. Mani Krishna . 2007. Fault-Tolerant Systems ( 1 st ed.). Morgan Kaufmann Publishers Inc ., San Francisco, CA, USA. Israel Koren and C. Mani Krishna. 2007. Fault-Tolerant Systems (1st ed.). Morgan Kaufmann Publishers Inc., San Francisco, CA, USA.","edition":"1"},{"key":"e_1_3_2_1_26_1","volume-title":"Mlbase: A distributed machine-learning system. In In CIDR.","author":"Kraska Tim","year":"2013","unstructured":"Tim Kraska , Ameet Talwalkar , and John Duchi . 2013 . Mlbase: A distributed machine-learning system. In In CIDR. Tim Kraska, Ameet Talwalkar, and John Duchi. 2013. Mlbase: A distributed machine-learning system. In In CIDR."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685095"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.14778\/2212351.2212354"},{"key":"e_1_3_2_1_30_1","unstructured":"Microsoft Azure. 2018. Microsoft Azure Machine Learning. https:\/\/azure.microsoft.com\/en-us\/overview\/machine-learning\/.  Microsoft Azure. 2018. Microsoft Azure Machine Learning. https:\/\/azure.microsoft.com\/en-us\/overview\/machine-learning\/."},{"key":"e_1_3_2_1_31_1","unstructured":"Mongo Inc. 2018. MongoDB. https:\/\/www.mongodb.com\/.  Mongo Inc. 2018. MongoDB. https:\/\/www.mongodb.com\/."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522738"},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA Inc. 2014. TESLA K80. https:\/\/www.nvidia.com\/en-us\/data-center\/teslak80\/.  NVIDIA Inc. 2014. TESLA K80. https:\/\/www.nvidia.com\/en-us\/data-center\/teslak80\/."},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA Inc. 2018. TESLA P100: Infinite Compute Power for the Modern Data Center. http:\/\/www.nvidia.com\/object\/tesla-p100.html.  NVIDIA Inc. 2018. TESLA P100: Infinite Compute Power for the Modern Data Center. http:\/\/www.nvidia.com\/object\/tesla-p100.html."},{"key":"e_1_3_2_1_35_1","volume-title":"Hemingway: Modeling Distributed Optimization Algorithms. arXiv preprint arXiv: 1702.05865","author":"Pan Xinghao","year":"2017","unstructured":"Xinghao Pan , Shivaram Venkataraman , Zizheng Tai , and Joseph Gonzalez . 2017 . Hemingway: Modeling Distributed Optimization Algorithms. arXiv preprint arXiv: 1702.05865 (2017). Xinghao Pan, Shivaram Venkataraman, Zizheng Tai, and Joseph Gonzalez. 2017. Hemingway: Modeling Distributed Optimization Algorithms. arXiv preprint arXiv: 1702.05865 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the Thirteenth EuroSys Conference (EuroSys '18)","author":"Park Jun Woo","year":"1905","unstructured":"Jun Woo Park , Alexey Tumanov , Angela Jiang , Michael A. Kozuch , and Gregory R. Ganger . 2018. 3Sigma: Distribution-based Cluster Scheduling for Runtime Uncertainty . In Proceedings of the Thirteenth EuroSys Conference (EuroSys '18) . ACM, New York, NY, USA, Article 2, 17 pages. https:\/\/doi.org\/10.1145\/3 1905 08.3190515 10.1145\/3190508.3190515 Jun Woo Park, Alexey Tumanov, Angela Jiang, Michael A. Kozuch, and Gregory R. Ganger. 2018. 3Sigma: Distribution-based Cluster Scheduling for Runtime Uncertainty. In Proceedings of the Thirteenth EuroSys Conference (EuroSys '18). ACM, New York, NY, USA, Article 2, 17 pages. https:\/\/doi.org\/10.1145\/3190508.3190515"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_38_1","volume-title":"Barbosa","author":"Sampaio Altino M.","year":"2017","unstructured":"Altino M. Sampaio and Jorge G . Barbosa . 2017 . A Comparative Cost Study of Fault-Tolerant Techniques for Availability on the Cloud. In Ambient Intelligence - Software and Applications - 8th International Symposium on Ambient Intelligence, ISAmI 2017, Porto, Portugal , June 21-23, 2017. 263--268. https:\/\/doi.org\/10.1007\/978-3-319-61118-1_32 10.1007\/978-3-319-61118-1_32 Altino M. Sampaio and Jorge G. Barbosa. 2017. A Comparative Cost Study of Fault-Tolerant Techniques for Availability on the Cloud. In Ambient Intelligence - Software and Applications - 8th International Symposium on Ambient Intelligence, ISAmI 2017, Porto, Portugal, June 21-23, 2017. 263--268. https:\/\/doi.org\/10.1007\/978-3-319-61118-1_32"},{"key":"e_1_3_2_1_39_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso . 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 ( 2018 ). arXiv:1802.05799 http:\/\/arxiv.org\/abs\/1802.05799 Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 (2018). arXiv:1802.05799 http:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jnca.2016.08.010"},{"key":"e_1_3_2_1_41_1","unstructured":"K. Simonyan and A. Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. http:\/\/www.robots.ox.ac.uk\/vgg\/research\/very_deep\/.  K. Simonyan and A. Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. http:\/\/www.robots.ox.ac.uk\/vgg\/research\/very_deep\/."},{"key":"e_1_3_2_1_42_1","volume-title":"Rethinking the Inception Architecture for Computer Vision. CoRR abs\/1512.00567","author":"Szegedy Christian","year":"2015","unstructured":"Christian Szegedy , Vincent Vanhoucke , Sergey Ioffe , Jonathon Shlens , and Zbigniew Wojna . 2015. Rethinking the Inception Architecture for Computer Vision. CoRR abs\/1512.00567 ( 2015 ). arXiv:1512.00567 http:\/\/arxiv.org\/abs\/1512.00567 Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. 2015. Rethinking the Inception Architecture for Computer Vision. CoRR abs\/1512.00567 (2015). arXiv:1512.00567 http:\/\/arxiv.org\/abs\/1512.00567"},{"key":"e_1_3_2_1_43_1","volume-title":"Analysis and Simulation of Computer and Telecommunication Systems (MASCOTS), 2015 IEEE 23rd International Symposium on.","author":"Tantawi Asser N.","year":"2015","unstructured":"Asser N. Tantawi . 2015 . On biasing towards optimized application placement in the cloud. In Modeling , Analysis and Simulation of Computer and Telecommunication Systems (MASCOTS), 2015 IEEE 23rd International Symposium on. Asser N. Tantawi. 2015. On biasing towards optimized application placement in the cloud. In Modeling, Analysis and Simulation of Computer and Telecommunication Systems (MASCOTS), 2015 IEEE 23rd International Symposium on."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICAC.2016.34"},{"key":"e_1_3_2_1_45_1","volume-title":"Ernest: Efficient Performance Prediction for Large-scale Advanced Analytics. In 13th Usenix Conference on Networked Systems Design and Implementation (NSDI'16)","author":"Venkataraman Shivaram","year":"2016","unstructured":"Shivaram Venkataraman , Zongheng Yang , Michael Franklin , Benjamin Recht , and Ion Stoica . 2016 . Ernest: Efficient Performance Prediction for Large-scale Advanced Analytics. In 13th Usenix Conference on Networked Systems Design and Implementation (NSDI'16) . USENIX Association, 363--378. Shivaram Venkataraman, Zongheng Yang, Michael Franklin, Benjamin Recht, and Ion Stoica. 2016. Ernest: Efficient Performance Prediction for Large-scale Advanced Analytics. In 13th Usenix Conference on Networked Systems Design and Implementation (NSDI'16). USENIX Association, 363--378."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing (SC '08)","author":"Wang Chao","unstructured":"Chao Wang , Frank Mueller , Christian Engelmann , and Stephen L. Scott . 2008. Proactive Process-level Live Migration in HPC Environments . In Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing (SC '08) . IEEE Press, Piscataway, NJ, USA, Article 43, 12 pages. http:\/\/dl.acm.org\/citation.cfm?id=1413370.1413414 Chao Wang, Frank Mueller, Christian Engelmann, and Stephen L. Scott. 2008. Proactive Process-level Live Migration in HPC Environments. In Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing (SC '08). IEEE Press, Piscataway, NJ, USA, Article 43, 12 pages. http:\/\/dl.acm.org\/citation.cfm?id=1413370.1413414"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2723372.2742793"},{"key":"e_1_3_2_1_48_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao , Romil Bhardwaj , Ramachandran Ramjee , Muthian Sivathanu , Nipun Kwatra , Zhenhua Han , Pratyush Patel , Xuan Peng , Hanyu Zhao , Quanlu Zhang , Fan Yang , and Lidong Zhou . 2018 . Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18) . USENIX Association, Carlsbad, CA, 595--610. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 595--610. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao"},{"key":"e_1_3_2_1_49_1","volume-title":"Jinliang Wei, Seunghak Lee, Xun Zheng, Pengtao Xie, Abhimanu Kumar, and Yaoliang Yu.","author":"Xing Eric P.","year":"2015","unstructured":"Eric P. Xing , Qirong Ho , Wei Dai , Jin Kyu Kim , Jinliang Wei, Seunghak Lee, Xun Zheng, Pengtao Xie, Abhimanu Kumar, and Yaoliang Yu. 2015 . Petuum : A New Platform for Distributed Machine Learning on Big Data.. In KDD, Longbing Cao, Chengqi Zhang, Thorsten Joachims, Geoffrey I. Webb, Dragos D. Margineantu, and Graham Williams (Eds.). ACM , 1335--1344. http:\/\/dblp.uni-trier.de\/db\/conf\/kdd\/kdd2015.html#XingHDKWLZXKY15 Eric P. Xing, Qirong Ho, Wei Dai, Jin Kyu Kim, Jinliang Wei, Seunghak Lee, Xun Zheng, Pengtao Xie, Abhimanu Kumar, and Yaoliang Yu. 2015. Petuum: A New Platform for Distributed Machine Learning on Big Data.. In KDD, Longbing Cao, Chengqi Zhang, Thorsten Joachims, Geoffrey I. Webb, Dragos D. Margineantu, and Graham Williams (Eds.). ACM, 1335--1344. http:\/\/dblp.uni-trier.de\/db\/conf\/kdd\/kdd2015.html#XingHDKWLZXKY15"},{"key":"e_1_3_2_1_50_1","volume-title":"Presented as part of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)","author":"Zaharia Matei","unstructured":"Matei Zaharia , Mosharaf Chowdhury , Tathagata Das , Ankur Dave , Justin Ma , Murphy McCauly , Michael J. Franklin , Scott Shenker , and Ion Stoica . 2012. Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing . In Presented as part of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12) . USENIX , San Jose, CA , 15--28. https:\/\/www.usenix.org\/conference\/nsdi12\/technical-sessions\/presentation\/zaharia Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauly, Michael J. Franklin, Scott Shenker, and Ion Stoica. 2012. Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing. In Presented as part of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12). USENIX, San Jose, CA, 15--28. https:\/\/www.usenix.org\/conference\/nsdi12\/technical-sessions\/presentation\/zaharia"},{"key":"e_1_3_2_1_51_1","volume-title":"SLAQ: Quality-Driven Scheduling for Distributed Machine Learning. In ACM Symposium on Cloud Computing. 390--404","author":"Zhang Haoyu","unstructured":"Haoyu Zhang , Logan Stafman , Andrew Or , and Michael J. Freedman . 2017 . SLAQ: Quality-Driven Scheduling for Distributed Machine Learning. In ACM Symposium on Cloud Computing. 390--404 . https:\/\/doi.org\/10.1145\/3127479.3127490 10.1145\/3127479.3127490 Haoyu Zhang, Logan Stafman, Andrew Or, and Michael J. Freedman. 2017. SLAQ: Quality-Driven Scheduling for Distributed Machine Learning. In ACM Symposium on Cloud Computing. 390--404. https:\/\/doi.org\/10.1145\/3127479.3127490"},{"key":"e_1_3_2_1_52_1","volume-title":"2017 26th International Conference on Computer Communication and Networks (ICCCN). 1--9. https:\/\/doi.org\/10","author":"Zhang K.","year":"2017","unstructured":"K. Zhang , S. Alqahtani , and M. Demirbas . 2017. A Comparison of Distributed Machine Learning Platforms . In 2017 26th International Conference on Computer Communication and Networks (ICCCN). 1--9. https:\/\/doi.org\/10 .1109\/ICCCN. 2017 .8038464 10.1109\/ICCCN.2017.8038464 K. Zhang, S. Alqahtani, and M. Demirbas. 2017. A Comparison of Distributed Machine Learning Platforms. In 2017 26th International Conference on Computer Communication and Networks (ICCCN). 1--9. https:\/\/doi.org\/10.1109\/ICCCN.2017.8038464"}],"event":{"name":"Middleware '19: 20th International Middleware Conference","location":"Davis CA USA","acronym":"Middleware '19","sponsor":["ACM Association for Computing Machinery","IFIP"]},"container-title":["Proceedings of the 20th International Middleware Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3361525.3361538","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3361525.3361538","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:49:26Z","timestamp":1750268966000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3361525.3361538"}},"subtitle":["A Flexible Multi-tenant Deep Learning Platform"],"short-title":[],"issued":{"date-parts":[[2019,12,9]]},"references-count":51,"alternative-id":["10.1145\/3361525.3361538","10.1145\/3361525"],"URL":"https:\/\/doi.org\/10.1145\/3361525.3361538","relation":{},"subject":[],"published":{"date-parts":[[2019,12,9]]},"assertion":[{"value":"2019-12-09","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}