{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:13:11Z","timestamp":1775326391008,"version":"3.50.1"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada","doi-asserted-by":"crossref","award":["http:\/\/dx.doi.org\/10.13039\/501100000038"],"award-info":[{"award-number":["http:\/\/dx.doi.org\/10.13039\/501100000038"]}],"id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the Big Data Research Analytics and Informa- tion Network (BRAIN) Alliance established by Ontario Research Fund - Research Excellence Program"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1007\/s10994-021-06064-w","type":"journal-article","created":{"date-parts":[[2021,10,19]],"date-time":"2021-10-19T22:34:05Z","timestamp":1634682845000},"page":"2867-2903","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["ZipLine: an optimized algorithm for the elastic bulk synchronous parallel model"],"prefix":"10.1007","volume":"110","author":[{"given":"Xing","family":"Zhao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Manos","family":"Papagelis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aijun","family":"An","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bao Xin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junfeng","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yonggang","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,10,19]]},"reference":[{"key":"6064_CR1","doi-asserted-by":"crossref","unstructured":"Benz, K., & Bohnert, T. (2013). Dependability modeling framework: A test procedure for high availability in cloud operating systems. In: 2013 IEEE 78th Vehicular technology conference (VTC Fall), IEEE, pp 1\u20138.","DOI":"10.1109\/VTCFall.2013.6692157"},{"key":"6064_CR2","unstructured":"Chen, J., Pan, X., Monga, R., Bengio, S., & Jozefowicz, R. (2016). Revisiting distributed synchronous sgd. arXiv preprint arXiv:160400981."},{"key":"6064_CR3","unstructured":"Chen, T., Li, M., Li, Y., Lin, M., Wang, N., Wang, M., Xiao, T., et\u00a0al. (2015). Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:151201274."},{"key":"6064_CR4","doi-asserted-by":"publisher","first-page":"514","DOI":"10.1109\/ACCESS.2014.2325029","volume":"2","author":"XW Chen","year":"2014","unstructured":"Chen, X. W., & Lin, X. (2014). Big data deep learning: Challenges and perspectives. IEEE Access, 2, 514\u2013525.","journal-title":"IEEE Access"},{"key":"6064_CR5","unstructured":"Coates, A., Huval, B., Wang, T., Wu, D., Catanzaro, B., & Andrew, N. (2013). Deep learning with cots hpc systems. In: International conference on machine learning, PMLR, pp 1337\u20131345."},{"key":"6064_CR6","unstructured":"Cui, H., Cipar, J., Ho, Q., Kim, J. K., Lee, S., Kumar, A., Wei, J., Dai, W., Ganger, G. R., Gibbons, P. B., et\u00a0al. (2014). Exploiting bounded staleness to speed up big data analytics. In: 2014 USENIX Annual technical conference (USENIX ATC 14), pp 37\u201348."},{"key":"6064_CR7","doi-asserted-by":"crossref","unstructured":"Cui, H., Zhang, H., Ganger, G. R., Gibbons, P. B., & Xing, E. P. (2016). Geeps: Scalable deep learning on distributed gpus with a gpu-specialized parameter server. In: Proceedings of the eleventh european conference on computer systems, pp 1\u201316.","DOI":"10.1145\/2901318.2901323"},{"key":"6064_CR8","unstructured":"Dean, J., Corrado, G., Monga, R., Chen, K., Devin, M., Mao, M., Ranzato, M, Senior, A., Tucker, P., Yang, K., Le, Q., & Ng, A. (2012). Large scale distributed deep networks. In  F. Pereira, C. J. C. Burges, L. Bottu,  & K. Q. Weinberger (Eds.), Advances in neural information processing systems (Vol. 25, pp. 1223\u20131231). Curran Associates, Inc. https:\/\/papers.nips.cc\/paper\/2012\/hash\/6aca97005c68f1206823815f66102863-Abstract.html."},{"key":"6064_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, IEEE, pp 248\u2013255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"6064_CR10","doi-asserted-by":"crossref","unstructured":"Dryden, N., Moon, T., Jacobs, S. A., Van\u00a0Essen, B. (2016). Communication quantization for data-parallel training of deep neural networks. In: 2016 2nd Workshop on machine learning in hpc environments (MLHPC), IEEE, pp 1\u20138.","DOI":"10.1109\/MLHPC.2016.004"},{"key":"6064_CR11","doi-asserted-by":"publisher","unstructured":"Dunke, F. (2014). Online optimization with lookahead. PhD thesis, Karlsruher Institut f\u00fcr Technologie (KIT), https:\/\/doi.org\/10.5445\/IR\/1000042132.","DOI":"10.5445\/IR\/1000042132"},{"key":"6064_CR12","unstructured":"Dutta, S., Joshi, G., Ghosh, S., Dube, P., & Nagpurkar, P. (2018). Slow and stale gradients can win the race: Error-runtime trade-offs in distributed sgd. In: International conference on artificial intelligence and statistics, PMLR, pp 803\u2013812."},{"issue":"2","key":"6064_CR13","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1006\/jpdc.1994.1085","volume":"22","author":"AV Gerbessiotis","year":"1994","unstructured":"Gerbessiotis, A. V., & Valiant, L. G. (1994). Direct bulk-synchronous parallel algorithms. Journal of Parallel and Distributed Computing, 22(2), 251\u2013267.","journal-title":"Journal of Parallel and Distributed Computing"},{"key":"6064_CR14","doi-asserted-by":"crossref","unstructured":"Harlap, A., Cui, H., Dai, W., Wei, J., Ganger, G. R., Gibbons, P. B., Gibson, G. A., & Xing, E. P. (2016). Addressing the straggler problem for iterative convergent parallel ml. In: Proceedings of the seventh ACM symposium on cloud computing, pp 98\u2013111.","DOI":"10.1145\/2987550.2987554"},{"key":"6064_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"6064_CR16","first-page":"1223","volume-title":"Advances in neural information processing systems","author":"Q Ho","year":"2013","unstructured":"Ho, Q., Cipar, J., Cui, H., Lee, S., Kim, J. K., Gibbons, P. B., et al. (2013). More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems (pp. 1223\u20131231). New York: Curran Associates Inc."},{"key":"6064_CR17","volume-title":"Learning multiple layers of features from tiny images","author":"A Krizhevsky","year":"2009","unstructured":"Krizhevsky, A., & Hinton, G. (2009). Learning multiple layers of features from tiny images. Citeseer: Tech. rep."},{"key":"6064_CR18","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. Advances in Neural Information Processing Systems, 25, 1097\u20131105.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"12","key":"6064_CR19","doi-asserted-by":"publisher","first-page":"2802","DOI":"10.1109\/TPDS.2020.3003307","volume":"31","author":"M Langer","year":"2020","unstructured":"Langer, M., He, Z., Rahayu, W., & Xue, Y. (2020). Distributed training of deep learning models: A taxonomic perspective. IEEE Transactions on Parallel and Distributed Systems, 31(12), 2802\u20132818.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"6064_CR20","doi-asserted-by":"crossref","unstructured":"Li, H., Kadav, A., Kruus, E., & Ungureanu, C. (2015). Malt: distributed data-parallelism for existing ml applications. In: Proceedings of the tenth european conference on computer systems, pp 1\u201316.","DOI":"10.1145\/2741948.2741965"},{"key":"6064_CR21","doi-asserted-by":"crossref","unstructured":"Li, M., Andersen, D. G., Park, J. W., Smola, A. J., Ahmed, A., Josifovski, V., Long, J., Shekita, E. J., & Su, B. Y. (2014). Scaling distributed machine learning with the parameter server. In: 11th USENIX Symposium on operating systems design and implementation (OSDI 14), pp 583\u2013598.","DOI":"10.1145\/2640087.2644155"},{"key":"6064_CR22","unstructured":"Liu, L., Jiang, H., He, P., Chen, W., Liu, X., Gao, J., & Han, J. (2019). On the variance of the adaptive learning rate and beyond. arXiv preprint arXiv:190803265."},{"key":"6064_CR23","unstructured":"Moritz, P., Nishihara, R., Stoica, I., & Jordan. M. I. (2015). Sparknet: Training deep networks in spark. arXiv preprint arXiv:151106051."},{"key":"6064_CR24","unstructured":"Recht, B., Re, C., Wright, S., & Niu, F. (2011). Hogwild: A lock-free approach to parallelizing stochastic gradient descent. In: Proceedings of the 24th international conference on neural information processing systems, NIPS\u201911, p 693-701."},{"key":"6064_CR25","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:14091556."},{"key":"6064_CR26","doi-asserted-by":"crossref","unstructured":"Strom, N. (2015). Scalable distributed dnn training using commodity gpu cloud computing. In: Sixteenth annual conference of the international speech communication association, ISCA, pp 1488\u20131492.","DOI":"10.21437\/Interspeech.2015-354"},{"key":"6064_CR27","first-page":"6378","volume":"31","author":"M Teng","year":"2018","unstructured":"Teng, M., & Wood, F. (2018). Bayesian distributed stochastic gradient descent. Advances in Neural Information Processing Systems, 31, 6378\u20136388.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"6064_CR28","doi-asserted-by":"crossref","unstructured":"Van\u00a0Hasselt, H., Guez, A., & Silver, D. (2016). Deep reinforcement learning with double q-learning. In: Proceedings of the AAAI conference on artificial intelligence, AAAI\u201916, pp. 2094\u20132100.","DOI":"10.1609\/aaai.v30i1.10295"},{"key":"6064_CR29","unstructured":"Wang, J., & Joshi, G. (2019). Adaptive communication strategies to achieve the best error-runtime trade-off in local-update sgd. In: Proceedings of machine learning and systems (SysML\u201919) 1: pp. 212\u2013229."},{"key":"6064_CR30","doi-asserted-by":"crossref","unstructured":"Wilson, D. R., & Martinez, T. R. (2001). The need for small learning rates on large problems. In: IJCNN\u201901. International joint conference on neural networks. Proceedings (Cat. No. 01CH37222), IEEE, vol\u00a01, pp 115\u2013119.","DOI":"10.1109\/IJCNN.2001.939002"},{"key":"6064_CR31","doi-asserted-by":"crossref","unstructured":"Wu, Y., Liu, L., Bae, J., Chow, K. H., Iyengar, A., Pu, C., Wei, W., Yu, L., & Zhang, Q. (2019). Demystifying learning rate policies for high accuracy training of deep neural networks. In: 2019 IEEE International conference on big data (Big Data), IEEE, pp 1971\u20131980.","DOI":"10.1109\/BigData47090.2019.9006104"},{"key":"6064_CR32","unstructured":"Zhang, H., Zheng, Z., Xu, S., Dai, W., Ho, Q., Liang, X., et\u00a0al. (2017). Poseidon: An efficient communication architecture for distributed deep learning on gpu clusters. In: USENIX Annual technical conference (USENIX ATC 17), pp 181\u2013193."},{"key":"6064_CR33","first-page":"906","volume":"33","author":"H Zhang","year":"2020","unstructured":"Zhang, H., Li, Y., Deng, Z., Liang, X., Carin, L., & Xing, E. (2020). Autosync: Learning to synchronize for data-parallel distributed deep learning. Advances in Neural Information Processing Systems, 33, 906\u2013917.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"6064_CR34","first-page":"685","volume":"28","author":"S Zhang","year":"2015","unstructured":"Zhang, S., Choromanska, A. E., & LeCun, Y. (2015). Deep learning with elastic averaging sgd. Advances in Neural Information Processing Systems, 28, 685\u2013693.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"6064_CR35","doi-asserted-by":"crossref","unstructured":"Zhao, X., An, A., Liu, J., Chen, B. X. (2019a). Dynamic stale synchronous parallel distributed training for deep learning. In: 2019 IEEE 39th International conference on distributed computing systems (ICDCS\u201919), IEEE, pp 1507\u20131517.","DOI":"10.1109\/ICDCS.2019.00150"},{"key":"6064_CR36","doi-asserted-by":"crossref","unstructured":"Zhao, X., Papagelis, M., An, A., Chen, B. X., Liu, J., & Hu, Y. (2019b). Elastic bulk synchronous parallel model for distributed deep learning. In: 2019 IEEE International conference on data mining (ICDM\u201919), IEEE, pp 1504\u20131509.","DOI":"10.1109\/ICDM.2019.00198"},{"key":"6064_CR37","unstructured":"Zhou, Z., Mertikopoulos, P., Bambos, N., Glynn, P., Ye, Y., Li, L. J., & Li, F. F. (2018). Distributed asynchronous optimization with unbounded delays: How slow can you go? In: 2018 International conference on machine learning, PMLR, pp 5970\u20135979."},{"key":"6064_CR38","unstructured":"Zhu, R., Yang, S., Pfadler, A., Qian, Z., & Zhou, J. (2020). Learning efficient parameter server synchronization policies for distributed sgd. In: 8th International conference on learning representations, URL https:\/\/openreview.net\/forum?id=rJxX8T4Kvr."},{"key":"6064_CR39","unstructured":"Zinkevich, M., Weimer, M., Li, L., & Smola, A. J. (2010). Parallelized stochastic gradient descent. In: Proceedings of the 23rd international conference on neural information processing systems, vol\u00a02, pp 2595\u20132603."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06064-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-021-06064-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06064-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,19]],"date-time":"2022-10-19T00:03:22Z","timestamp":1666137802000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-021-06064-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10]]},"references-count":39,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2021,10]]}},"alternative-id":["6064"],"URL":"https:\/\/doi.org\/10.1007\/s10994-021-06064-w","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,10]]},"assertion":[{"value":"2 March 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 August 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 September 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 October 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}