{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:08:30Z","timestamp":1755839310628,"version":"3.37.3"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"13","license":[{"start":{"date-parts":[[2022,4,21]],"date-time":"2022-04-21T00:00:00Z","timestamp":1650499200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,4,21]],"date-time":"2022-04-21T00:00:00Z","timestamp":1650499200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Key Research and Development Program of China","award":["(2016YFB0200902"],"award-info":[{"award-number":["(2016YFB0200902"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2022,9]]},"DOI":"10.1007\/s11227-022-04466-8","type":"journal-article","created":{"date-parts":[[2022,4,21]],"date-time":"2022-04-21T10:02:44Z","timestamp":1650535364000},"page":"15663-15680","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["EP4DDL: addressing straggler problem in heterogeneous distributed deep learning"],"prefix":"10.1007","volume":"78","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0362-7506","authenticated-orcid":false,"given":"Zeyu","family":"Ji","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingjun","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingbo","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia","family":"Wei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheng","family":"Wei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,4,21]]},"reference":[{"key":"4466_CR1","doi-asserted-by":"publisher","first-page":"101515","DOI":"10.1016\/j.techsoc.2020.101515","volume":"64","author":"Y Zhong","year":"2021","unstructured":"Zhong Y, Oh S, Moon HC (2021) Service transformation under industry 4.0: investigating acceptance of facial recognition payment through an extended technology acceptance model. Technol Soc 64:101515","journal-title":"Technol Soc"},{"issue":"1","key":"4466_CR2","doi-asserted-by":"publisher","first-page":"252","DOI":"10.1038\/s41386-020-00842-1","volume":"46","author":"R Stewart","year":"2021","unstructured":"Stewart R, Velupillai S (2021) Applied natural language processing in mental health big data. Neuropsychopharmacology 46(1):252","journal-title":"Neuropsychopharmacology"},{"key":"4466_CR3","unstructured":"Lanctot M, Lockhart E, Lespiau JB, et al (2019) OpenSpiel: a framework for reinforcement learning in games. arXiv preprint arXiv:1908.09453"},{"issue":"8","key":"4466_CR4","doi-asserted-by":"publisher","first-page":"1947","DOI":"10.1109\/TPDS.2021.3052895","volume":"32","author":"Y Peng","year":"2021","unstructured":"Peng Y, Bao Y, Chen Y et al (2021) Dl2: a deep learning-driven scheduler for deep learning clusters. IEEE Trans Parallel Distrib Syst 32(8):1947\u20131960","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"4466_CR5","doi-asserted-by":"crossref","unstructured":"Jiang J, Cui B, Zhang C et al (2017) Heterogeneity-aware distributed parameter servers. In: Proceedings of the ACM International Conference on Management of Data, pp 463\u2013478","DOI":"10.1145\/3035918.3035933"},{"key":"4466_CR6","unstructured":"Ho Q, Cipar J, Cui H et al (2013) More effective distributed ml via a stale synchronous parallel parameter server. Adv Neural Inf Process Syst:1223"},{"issue":"1","key":"4466_CR7","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1109\/TC.2020.2974461","volume":"70","author":"Q Zhou","year":"2020","unstructured":"Zhou Q, Guo S, Lu H et al (2020) Falcon: addressing stragglers in heterogeneous parameter server via multiple parallelism. IEEE Trans Comput 70(1):139\u2013155","journal-title":"IEEE Trans Comput"},{"issue":"12","key":"4466_CR8","doi-asserted-by":"publisher","first-page":"10050","DOI":"10.1007\/s11227-020-03241-x","volume":"76","author":"SS Gill","year":"2020","unstructured":"Gill SS, Ouyang X, Garraghan P (2020)\u00a0Tails in the cloud: a survey and taxonomy of straggler management within large-scale cloud data centres. J Supercomputing\u00a076(12):10050\u201310089","journal-title":"J Supercomputing"},{"key":"4466_CR9","doi-asserted-by":"crossref","unstructured":"Harlap A, Cui H, Dai W et al (2016) Addressing the straggler problem for iterative convergent parallel ML. In: Proceedings of the Seventh ACM Symposium on Cloud Computing, pp 98\u2013111","DOI":"10.1145\/2987550.2987554"},{"issue":"Special Issue o","key":"4466_CR10","first-page":"7","volume":"6","author":"A Kishor","year":"2021","unstructured":"Kishor A, Chakraborty C, Jeberson W (2021) A novel fog computing approach for minimization of latency in healthcare using machine learning. Int J Interact Multimed Artif Intell 6(Special Issue on Current Trends in Intelligent Multimedia Processing Systems):7\u201317","journal-title":"Int J Interact Multimed Artif Intell"},{"key":"4466_CR11","first-page":"1","volume":"4","author":"M Benalla","year":"2016","unstructured":"Benalla M (2016) A distributed intelligent system for emergency convoy. Int J Interact Multimed Artif Intell 4:1","journal-title":"Int J Interact Multimed Artif Intell"},{"issue":"2","key":"4466_CR12","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1145\/3152042.3152047","volume":"45","author":"MF Aktas","year":"2017","unstructured":"Aktas MF, Peng P, Soljanin E (2017) Effective straggler mitigation: which clones should attack and when? ACM SIGMETRICS Perform Eval Rev 45(2):12\u201314","journal-title":"ACM SIGMETRICS Perform Eval Rev"},{"issue":"3","key":"4466_CR13","doi-asserted-by":"publisher","first-page":"962","DOI":"10.1109\/TNNLS.2020.2979762","volume":"32","author":"J Zhang","year":"2020","unstructured":"Zhang J, Simeone O (2020) LAGC: Lazily aggregated gradient coding for straggler-tolerant and communication-efficient distributed learning[J]. IEEE Trans Neural Networks Learn Syst 32(3): 962\u2013974","journal-title":"IEEE Trans Neural Networks Learn Syst"},{"issue":"1","key":"4466_CR14","doi-asserted-by":"publisher","first-page":"277","DOI":"10.1109\/JSAIT.2020.2991361","volume":"1","author":"R Bitar","year":"2020","unstructured":"Bitar R, Wootters M, El Rouayheb S (2020) Stochastic gradient coding for straggler mitigation in distributed learning. IEEE J Sel Areas Inf Theor 1(1):277\u2013291","journal-title":"IEEE J Sel Areas Inf Theor"},{"issue":"3","key":"4466_CR15","doi-asserted-by":"publisher","first-page":"798","DOI":"10.1109\/TPDS.2016.2587641","volume":"28","author":"Y Guo","year":"2016","unstructured":"Guo Y, Rao J, Jiang C et al (2016) Moving hadoop into the cloud with flexible slot management and speculative execution. IEEE Tran Parallel Distrib Syst 28(3):798\u2013812","journal-title":"IEEE Tran Parallel Distrib Syst"},{"issue":"5","key":"4466_CR16","doi-asserted-by":"publisher","first-page":"566","DOI":"10.1145\/3187009.3177734","volume":"11","author":"Y Huang","year":"2018","unstructured":"Huang Y, Jin T, Wu Y et al (2018) Flexps: Flexible parallelism control in parameter server architecture. Proc VLDB Endow 11(5):566\u2013579","journal-title":"Proc VLDB Endow"},{"key":"4466_CR17","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inf Process Syst 25:1097\u20131105","journal-title":"Adv Neural Inf Process Syst"},{"key":"4466_CR18","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"key":"4466_CR19","unstructured":"LeCun Y, Cortes C, Burges CJC \"THE MNIST DATABASE of handwritten digits\". http:\/\/yann.lecun.com\/exdb\/mnist\/"},{"key":"4466_CR20","unstructured":"Krizhevsky A, Nair V, Hinton G CIFAR-10: cs.toronto.edu\/~kriz\/cifar.html"},{"key":"4466_CR21","unstructured":"Huang Y, Cheng Y, Bapna A et al (2018) Gpipe: Efficient training of giant neural networks using pipeline parallelism. arXiv preprint arXiv:1811.06965"},{"key":"4466_CR22","unstructured":"Dean J, Corrado GS, Monga R et al (2012) Large scale distributed deep networks"},{"key":"4466_CR23","unstructured":"Wu X, Xu H, Li B et al (2020) Stanza: layer separation for distributed training in deep learning. IEEE Trans Serv Comput"},{"key":"4466_CR24","doi-asserted-by":"crossref","unstructured":"Geng J, Li D, Wang S (2020) Fela: incorporating flexible parallelism and elastic tuning to accelerate large-scale DML. In: 2020 IEEE 36th International Conference on Data Engineering (ICDE). IEEE, pp 1393\u20131404","DOI":"10.1109\/ICDE48307.2020.00124"},{"key":"4466_CR25","unstructured":"Chen J, Pan X, Monga R et al (2016) Revisiting distributed synchronous SGD. arXiv preprint arXiv:1604.00981"},{"key":"4466_CR26","unstructured":"Zheng S, Meng Q, Wang T et al (2017) Asynchronous stochastic gradient descent with delay compensation. In: International Conference on Machine Learning. PMLR, pp 4120\u20134129"},{"issue":"Special Issue o","key":"4466_CR27","first-page":"66","volume":"6","author":"S Costantini","year":"2021","unstructured":"Costantini S, De Gasperis G, De Lauretis L (2021) An application of declarative languages in distributed architectures: ASP and DALI microservices. Int J Interact Multimed Artif Intell 6(Special Issue on Artificial Intelligence, Paving the Way to the Future):66\u201378","journal-title":"Int J Interact Multimed Artif Intell"},{"key":"4466_CR28","first-page":"693","volume":"24","author":"F Niu","year":"2011","unstructured":"Niu F, Recht B, Re C et al (2011) HOGWILD!: a lock-free approach to parallelizing stochastic gradient descent. Adv Neural Inf Process Syst 24:693\u2013701","journal-title":"Adv Neural Inf Process Syst"},{"key":"4466_CR29","unstructured":"Zhang W, Gupta S, Lian X et al (2016) Staleness-aware async-SGD for distributed deep learning. In: Proceedings of the Twenty-Fifth International Joint Conference on Artificial Intelligence, pp 2350\u20132356"},{"key":"4466_CR30","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.future.2021.02.012","volume":"120","author":"M Chen","year":"2021","unstructured":"Chen M, Mao B, Ma T (2021) FedSA: a staleness-aware asynchronous Federated Learning algorithm with non-IID data. Fut Gener Comput Syst 120:1\u201312","journal-title":"Fut Gener Comput Syst"},{"issue":"10","key":"4466_CR31","doi-asserted-by":"publisher","first-page":"2176","DOI":"10.1109\/TPDS.2018.2827055","volume":"29","author":"H Khaleghzadeh","year":"2018","unstructured":"Khaleghzadeh H, Manumachu RR, Lastovetsky A (2018) A novel data-partitioning algorithm for performance optimization of data-parallel applications on heterogeneous HPC platforms[J]. IEEE Trans Parallel Distribut Syst 29(10):2176\u20132190","journal-title":"IEEE Trans Parallel Distribut Syst"},{"key":"4466_CR32","doi-asserted-by":"crossref","unstructured":"Cho K, Van Merri\u00ebnboer B, Gulcehre C et al (2014) Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"key":"4466_CR33","doi-asserted-by":"crossref","unstructured":"Chen, C et al (2018) Fast distributed deep learning via worker-adaptive batch sizing. In: Proceedings of the ACM Symposium on Cloud Computing","DOI":"10.1145\/3267809.3275463"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-04466-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-022-04466-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-04466-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T15:14:46Z","timestamp":1659971686000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-022-04466-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4,21]]},"references-count":33,"journal-issue":{"issue":"13","published-print":{"date-parts":[[2022,9]]}},"alternative-id":["4466"],"URL":"https:\/\/doi.org\/10.1007\/s11227-022-04466-8","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2022,4,21]]},"assertion":[{"value":"17 March 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}