{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,30]],"date-time":"2025-04-30T10:45:43Z","timestamp":1746009943372},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Comput Manag Sci"],"published-print":{"date-parts":[[2024,6]]},"DOI":"10.1007\/s10287-023-00496-6","type":"journal-article","created":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T02:01:43Z","timestamp":1703383303000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Preconditioning meets biased compression for efficient distributed optimization"],"prefix":"10.1007","volume":"21","author":[{"given":"Vitali","family":"Pirau","sequence":"first","affiliation":[]},{"given":"Aleksandr","family":"Beznosikov","sequence":"additional","affiliation":[]},{"given":"Martin","family":"Tak\u00e1\u010d","sequence":"additional","affiliation":[]},{"given":"Vladislav","family":"Matyukhin","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Gasnikov","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,24]]},"reference":[{"key":"496_CR1","unstructured":"Agafonov A, Kamzolov D, Tappenden R, Gasnikov A, Tak\u00e1\u010d M (2022) Flecs: a federated learning second-order framework via compression and sketching. arXiv:2206.02009"},{"key":"496_CR2","unstructured":"Alistarh D, Hoefler T, Johansson M, Konstantinov N, Khirirat S, Renggli C (2018) The convergence of sparsified gradient methods. Adv Neural Inform Process Syst 31"},{"key":"496_CR3","unstructured":"Arora S, Cohen N, Hazan E (2018) On the optimization of deep networks: Implicit acceleration by overparameterization. In: International Conference on Machine Learning. PMLR, pp 244\u2013253"},{"issue":"11\u201312","key":"496_CR4","doi-asserted-by":"publisher","first-page":"1214","DOI":"10.1016\/j.apnum.2007.01.003","volume":"57","author":"C Bekas","year":"2007","unstructured":"Bekas C, Kokiopoulou E, Saad Y (2007) An estimator for the diagonal of a matrix. Appl Numer Math 57(11\u201312):1214\u20131229","journal-title":"Appl Numer Math"},{"key":"496_CR5","unstructured":"Beznosikov A, Horv\u00e1th S, Richt\u00e1rik P, Safaryan M (2020) On biased compression for distributed learning. arXiv:2002.12410"},{"key":"496_CR6","unstructured":"Dehghani M, Djolonga J, Mustafa B, Padlewski P, Heek J, Gilmer J, Steiner A, Caron M, Geirhos R, Alabdulmohsin I, et al (2023) Scaling vision transformers to 22 billion parameters. arXiv preprint arXiv:2302.05442"},{"issue":"7","key":"496_CR7","first-page":"2121","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi J, Hazan E, Singer Y (2011) Adaptive subgradient methods for online learning and stochastic optimization. J. Mach. Learn. Res. 12(7):2121\u20132159","journal-title":"J. Mach. Learn. Res."},{"key":"496_CR8","unstructured":"Fatkhullin I, Sokolov I, Gorbunov E, Li Z, Richt\u00e1rik P (2021) Ef21 with bells & whistles: Practical algorithmic extensions of modern error feedback. arXiv preprint arXiv:2110.03294"},{"key":"496_CR9","unstructured":"Gorbunov E, Burlachenko KP, Li Z, Richt\u00e1rik P (2021) Marina: Faster non-convex distributed learning with compression. In: International conference on machine learning. PMLR, pp 3788\u20133798"},{"key":"496_CR10","unstructured":"Gruntkowska K, Tyurin A, Richt\u00e1rik P (2022) Ef21-p and friends: Improved theoretical communication complexity for distributed optimization with bidirectional compression. arXiv preprint arXiv:2209.15218"},{"key":"496_CR11","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"496_CR12","unstructured":"Horv\u00f3th S, Ho C-Y, Horvath L, Sahu AN, Canini M, Richt\u00e1rik P (2022) Natural compression for distributed deep learning. In: Mathematical and scientific machine learning. PMLR, pp 129\u2013141"},{"key":"496_CR13","unstructured":"Jahani M, Rusakov S, Shi Z, Richt\u00e1rik P, Mahoney MW, Tak\u00e1\u010d M (2021) Doubly adaptive scaled algorithm for machine learning using second-order information. arXiv preprint arXiv:2109.05198"},{"key":"496_CR14","unstructured":"Karimireddy SP, Rebjock Q, Stich S, Jaggi M (2019) Error feedback fixes signsgd and other gradient compression schemes. In: International conference on machine learning. PMLR, pp 3252\u20133261"},{"key":"496_CR15","unstructured":"Kingma DP, Ba J (2014) Adam: A method for stochastic optimization. arXiv:1412.6980"},{"key":"496_CR16","unstructured":"Kone\u010dn\u1ef3 J, McMahan HB, Yu FX, Richt\u00e1rik P, Suresh AT, Bacon D (2016) Federated learning: Strategies for improving communication efficiency. arXiv:1610.05492"},{"key":"496_CR17","unstructured":"Krizhevsky A, Hinton G, et al (2009) Learning multiple layers of features from tiny images"},{"key":"496_CR18","unstructured":"Lin Y, Han S, Mao H, Wang Y, Dally WJ (2017) Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv:1712.01887"},{"key":"496_CR19","unstructured":"Mao A, Mohri M, Zhong Y (2023) Cross-entropy loss functions: Theoretical analysis and applications. arXiv preprint arXiv:2304.07288"},{"key":"496_CR20","unstructured":"Mishchenko K, Gorbunov E, Tak\u00e1\u010d M, Richt\u00e1rik P (2019) Distributed learning with compressed gradient differences. arXiv:1901.09269"},{"key":"496_CR21","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-91578-4","volume-title":"Lectures on Convex Optimization","author":"Y Nesterov","year":"2018","unstructured":"Nesterov Y et al (2018) Lectures on Convex Optimization, vol 137. Springer, Berlin"},{"key":"496_CR22","unstructured":"OpenAI (2023) Gpt-4 technical report. arXiv:2303.08774"},{"key":"496_CR23","first-page":"4384","volume":"34","author":"P Richt\u00e1rik","year":"2021","unstructured":"Richt\u00e1rik P, Sokolov I, Fatkhullin I (2021) Ef21: A new, simpler, theoretically better, and practically faster error feedback. Adv Neural Inf Process Syst 34:4384\u20134396","journal-title":"Adv Neural Inf Process Syst"},{"key":"496_CR24","unstructured":"Sadiev A, Beznosikov A, Almansoori AJ, Kamzolov D, Tappenden R, Tak\u00e1\u010d M (2022) Stochastic gradient methods with preconditioned updates. arXiv:2206.00285"},{"key":"496_CR25","unstructured":"Sapio A, Canini M, Ho C-Y, Nelson J, Kalnis P, Kim C, Krishnamurthy A, Moshref M, Ports DR, Richt\u00e1rik P (2019) Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701"},{"key":"496_CR26","doi-asserted-by":"crossref","unstructured":"Seide F, Fu H, Droppo J, Li G, Yu D (2014) 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech dnns. In: Fifteenth annual conference of the international speech communication association","DOI":"10.21437\/Interspeech.2014-274"},{"key":"496_CR27","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781107298019","volume-title":"Understanding machine learning: from theory to algorithms","author":"S Shalev-Shwartz","year":"2014","unstructured":"Shalev-Shwartz S, Ben-David S (2014) Understanding machine learning: from theory to algorithms. Cambridge University Press, Cambridge"},{"issue":"2","key":"496_CR28","first-page":"26","volume":"4","author":"T Tieleman","year":"2012","unstructured":"Tieleman T, Hinton G et al (2012) Lecture 6.5-rmsprop: divide the gradient by a running average of its recent magnitude. COURSERA Neural Netw Mach Learn 4(2):26\u201331","journal-title":"COURSERA Neural Netw Mach Learn"},{"key":"496_CR29","unstructured":"Vapnik V (1991) Principles of risk minimization for learning theory. Adv Neural Inform Process Syst 4"},{"key":"496_CR30","first-page":"7","volume-title":"Numerical optimization","author":"S Wright","year":"1999","unstructured":"Wright S, Nocedal J et al (1999) Numerical optimization, vol 35(67\u201368). Springer, Berlin, p 7"},{"key":"496_CR31","unstructured":"Zhang J, Karimireddy SP, Veit A, Kim S, Reddi SJ, Kumar S, Sra S (2019) Why adam beats sgd for attention models"},{"key":"496_CR32","unstructured":"Zhao WX, Zhou K, Li J, Tang T, Wang X, Hou Y, Min Y, Zhang B, Zhang J, Dong Z, et al (2023) A survey of large language models. arXiv preprint arXiv:2303.18223"}],"container-title":["Computational Management Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10287-023-00496-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10287-023-00496-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10287-023-00496-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,22]],"date-time":"2024-06-22T12:09:42Z","timestamp":1719058182000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10287-023-00496-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,24]]},"references-count":32,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,6]]}},"alternative-id":["496"],"URL":"https:\/\/doi.org\/10.1007\/s10287-023-00496-6","relation":{},"ISSN":["1619-697X","1619-6988"],"issn-type":[{"value":"1619-697X","type":"print"},{"value":"1619-6988","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,24]]},"assertion":[{"value":"30 June 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 November 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"14"}}