{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:24:00Z","timestamp":1740122640537,"version":"3.37.3"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T00:00:00Z","timestamp":1632096000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T00:00:00Z","timestamp":1632096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772020"],"award-info":[{"award-number":["61772020"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2022,5]]},"DOI":"10.1007\/s10489-021-02618-6","type":"journal-article","created":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T20:37:20Z","timestamp":1632170240000},"page":"7091-7112","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Faster doubly stochastic functional gradient by gradient preconditioning for scalable kernel methods"],"prefix":"10.1007","volume":"52","author":[{"given":"Zhuan","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4764-9483","authenticated-orcid":false,"given":"Shuisheng","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Ting","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Junna","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,20]]},"reference":[{"key":"2618_CR1","unstructured":"Altschuler J, Bach F, Rudi A, Niles-Weed J (2019) Massively scalable Sinkhorn distances via the nystrom\u0308 method. In: Advances in neural information processing systems, pp 4427\u20134437"},{"key":"2618_CR2","unstructured":"Avron H, Kapralov M, Musco C, Musco C, Velingker A, Zandieh A (2017) Random Fourier features for kernel ridge regression: Approximation bounds and statistical guarantees. In: International conference on machine learning, pp 253\u2013262"},{"issue":"1","key":"2618_CR3","first-page":"4096","volume":"17","author":"H Avron","year":"2016","unstructured":"Avron H, Sindhwani V, Yang J, Mahoney MW (2016) Quasi-monte Carlo feature maps for shift-invariant kernels. J Mach Learn Res 17(1):4096\u20134133","journal-title":"J Mach Learn Res"},{"key":"2618_CR4","doi-asserted-by":"crossref","unstructured":"Bengio Y (2012) Practical recommendations for gradient-based training of deep architectures. In: Neural networks: tricks of the trade, Springer, pp 437\u2013478","DOI":"10.1007\/978-3-642-35289-8_26"},{"key":"2618_CR5","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511804441","volume-title":"Convex optimization","author":"S Boyd","year":"2004","unstructured":"Boyd S, Vandenberghe L (2004) Convex optimization. Cambridge University Press, Cambridge"},{"issue":"5","key":"2618_CR6","doi-asserted-by":"publisher","first-page":"1155","DOI":"10.1162\/neco.2007.19.5.1155","volume":"19","author":"O Chapelle","year":"2007","unstructured":"Chapelle O (2007) Training a support vector machine in the primal. Neural Comput 19 (5):1155\u20131178","journal-title":"Neural Comput"},{"key":"2618_CR7","doi-asserted-by":"crossref","unstructured":"Ch\u00e1vez G, Liu Y, Ghysels P, Li XS, Rebrova E (2020) Scalable and memory-efficient kernel ridge regression. In: 2020 IEEE International parallel and distributed processing symposium (IPDPS), pp 956\u2013965","DOI":"10.1109\/IPDPS47924.2020.00102"},{"key":"2618_CR8","unstructured":"Chen X, Yang H, King I, Lyu MR (2015) Training-efficient feature map for shift-invariant kernels. In: Twenty-fourth international joint conference on artificial intelligence, pp 3395\u20133401"},{"key":"2618_CR9","unstructured":"Cutajar K, Osborne M, Cunningham J, Filippone M (2016) Preconditioning kernel matrices. In: International conference on machine learning, pp 2529\u20132538"},{"key":"2618_CR10","unstructured":"Dai B, Xie B, He N, Liang Y, Raj A, Balcan MFF, Song L (2014) Scalable kernel methods via doubly stochastic gradients. In: Advances in neural information processing systems, pp 3041\u20133049"},{"issue":"7","key":"2618_CR11","first-page":"2121","volume":"12","author":"JC Duchi","year":"2011","unstructured":"Duchi JC, Hazan E, Singer Y (2011) Adaptive subgradient methods for online learning and stochastic optimization. J Mach Learn Res 12(7):2121\u20132159","journal-title":"J Mach Learn Res"},{"issue":"Dec","key":"2618_CR12","first-page":"243","volume":"2","author":"S Fine","year":"2001","unstructured":"Fine S, Scheinberg K (2001) Efficient SVM training using low-rank kernel representations. J Mach Learn Res 2(Dec):243\u2013264","journal-title":"J Mach Learn Res"},{"key":"2618_CR13","unstructured":"Gonen A, Orabona F, Shalevshwartz S (2016) Solving ridge regression using sketched preconditioned SVRG. In: International conference on machine learning, pp 1397\u20131405"},{"key":"2618_CR14","unstructured":"Gu B, Geng X, Li X, Shi W, Zheng G, Deng C, Huang H (2020) Scalable kernel ordinal regression via doubly stochastic gradients. IEEE Transactions on Neural Networks and Learning Systems, pp 1\u201313"},{"issue":"4","key":"2618_CR15","doi-asserted-by":"publisher","first-page":"1116","DOI":"10.1137\/16M1105396","volume":"38","author":"KLC Haim Avron","year":"2017","unstructured":"Haim Avron KLC, Woodruff DP (2017) Faster kernel ridge regression using sketching and preconditioning. SIAM J Matrix Anal Appl 38(4):1116\u20131138","journal-title":"SIAM J Matrix Anal Appl"},{"key":"2618_CR16","unstructured":"Kar P, Karnick H (2012) Random feature maps for dot product kernels. In: Artificial intelligence and statistics, pp 583\u2013591"},{"key":"2618_CR17","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: International conference on learning representations, pp 1\u201313"},{"issue":"8","key":"2618_CR18","doi-asserted-by":"publisher","first-page":"2165","DOI":"10.1109\/TSP.2004.830991","volume":"52","author":"J Kivinen","year":"2004","unstructured":"Kivinen J, Smola AJ, Williamson RC (2004) Online learning with kernels. IEEE Trans Signal Process 52(8):2165\u20132176","journal-title":"IEEE Trans Signal Process"},{"key":"2618_CR19","doi-asserted-by":"crossref","unstructured":"Kolotilina LY, Axelsson O (1990) Preconditioned conjugate gradient methods. Springer","DOI":"10.1007\/BFb0090897"},{"key":"2618_CR20","unstructured":"Le Roux N, Manzagol PA, Bengio Y (2007) Topmoumoute online natural gradient algorithm. In: Advances in neural information processing systems, pp 849\u2013856"},{"key":"2618_CR21","doi-asserted-by":"publisher","first-page":"84242","DOI":"10.1109\/ACCESS.2019.2924542","volume":"7","author":"D Lei","year":"2019","unstructured":"Lei D, Tang J, Li Z, Wu Y (2019) Using low-rank approximations to speed up kernel logistic regression algorithm. IEEE Access 7:84242\u201384252","journal-title":"IEEE Access"},{"key":"2618_CR22","unstructured":"Li CL, P\u00f3czos B (2016) Utilize old coordinates: Faster doubly stochastic gradients for kernel methods. UAI, pp 467\u2013476"},{"key":"2618_CR23","unstructured":"Li X, Gu B, Ao S, Wang H, Ling CX (2017) Triply stochastic gradients on multiple kernel learning. In: UAI"},{"key":"2618_CR24","unstructured":"Li Z, Ton JF, Oglic D, Sejdinovic D (2019) Towards a unified analysis of random Fourier features. In: International conference on machine learning, pp 3905\u20133914"},{"key":"2618_CR25","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1016\/j.jco.2018.02.004","volume":"47","author":"J Lin","year":"2018","unstructured":"Lin J, Rosasco L (2018) Generalization properties of doubly stochastic learning algorithms. J Complex 47:42\u201361","journal-title":"J Complex"},{"key":"2618_CR26","doi-asserted-by":"crossref","unstructured":"Liu F, Huang X, Chen Y, Suykens JA (2020) Random features for kernel approximation:, A survey in algorithms, theory, and beyond. arXiv:2004.11154","DOI":"10.1109\/TPAMI.2021.3097011"},{"issue":"4","key":"2618_CR27","doi-asserted-by":"publisher","first-page":"983","DOI":"10.1007\/s10489-016-0881-0","volume":"46","author":"S Maldonado","year":"2017","unstructured":"Maldonado S, L\u00f3pez J (2017) Robust kernel-based multiclass support vector machines via second-order cone programming. Appl Intell 46(4):983\u2013992","journal-title":"Appl Intell"},{"key":"2618_CR28","unstructured":"Mason L, Baxter J, Bartlett PL, Frean M, et al. (1999) Functional gradient techniques for combining hypotheses. In: Advances in neural information processing systems, MIT, pp 221\u2013246"},{"key":"2618_CR29","unstructured":"Munkhoeva M, Kapushev Y, Burnaev E, Oseledets I (2018) Quadrature-based features for kernel approximation. In: Advances in neural information processing systems, pp 9147\u20139156"},{"key":"2618_CR30","unstructured":"Musco C, Musco C (2015) Randomized block Krylov methods for stronger and faster approximate singular value decomposition. In: Advances in neural information processing systems, pp 1396\u20131404"},{"key":"2618_CR31","unstructured":"Rahimi A, Recht B (2008) Random features for large-scale kernel machines. In: Advances in neural information processing systems, pp 1177\u20131184"},{"key":"2618_CR32","first-page":"1017","volume":"20","author":"ND Ratliff","year":"2007","unstructured":"Ratliff ND, Bagnell JA (2007) Kernel conjugate gradient for fast kernel machines. IJCAI 20:1017\u20131021","journal-title":"IJCAI"},{"issue":"3","key":"2618_CR33","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins H, Monro S (1951) A stochastic approximation method. Ann Math Stat 22(3):400\u2013407","journal-title":"Ann Math Stat"},{"key":"2618_CR34","doi-asserted-by":"crossref","unstructured":"Scholkopf B, Smola AJ (2018) Learning with kernels: support vector machines, regularization, optimization, and beyond. Adaptive Computation and Machine Learning series","DOI":"10.7551\/mitpress\/4175.001.0001"},{"key":"2618_CR35","unstructured":"Shabat G, Choshen E, Ben-Or D, Carmel N (2019) Fast and accurate Gaussian kernel ridge regression using matrix decompositions for preconditioning. arXiv:1905.10587"},{"issue":"1","key":"2618_CR36","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/s10107-010-0420-4","volume":"127","author":"S Shalev-Shwartz","year":"2011","unstructured":"Shalev-Shwartz S, Singer Y, Srebro N, Cotter A (2011) Pegasos: Primal estimated sub-gradient solver for SVM. Math Program 127(1):3\u201330","journal-title":"Math Program"},{"key":"2618_CR37","doi-asserted-by":"crossref","unstructured":"Shen Z, Qian H, Mu T, Zhang C (2017) Accelerated doubly stochastic gradient algorithm for large-scale empirical risk minimization. In: IJCAI, pp 2715\u20132721","DOI":"10.24963\/ijcai.2017\/378"},{"issue":"2","key":"2618_CR38","first-page":"26","volume":"4","author":"T Tieleman","year":"2012","unstructured":"Tieleman T, Hinton G (2012) Rmsprop: Divide the gradient by a running average of its recent magnitude. COURSERA: Neural Netw Mach Learn 4(2):26\u201331","journal-title":"COURSERA: Neural Netw Mach Learn"},{"key":"2618_CR39","unstructured":"Tu S, Roelofs R, Venkataraman S, Recht B (2016) Large scale kernel learning using block coordinate descent. arXiv:1602.05310"},{"key":"2618_CR40","unstructured":"Vinyals O, Povey D (2012) Krylov subspace descent for deep learning. In: Artificial intelligence and statistics, pp 1261\u20131268"},{"key":"2618_CR41","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1016\/j.neucom.2019.07.070","volume":"364","author":"D Wang","year":"2019","unstructured":"Wang D, Xu J (2019) Faster constrained linear regression via two-step preconditioning. Neurocomputing 364:280\u2013296","journal-title":"Neurocomputing"},{"key":"2618_CR42","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511617539","volume-title":"Scattered data approximation","author":"H Wendland","year":"2004","unstructured":"Wendland H (2004) Scattered data approximation. Cambridge University Press, Cambridge"},{"key":"2618_CR43","unstructured":"Williams CK, Seeger M (2001) Using the nystrom\u0308 method to speed up kernel machines. In: Advances in neural information processing systems, pp 682\u2013688"},{"key":"2618_CR44","doi-asserted-by":"crossref","unstructured":"Yang J, Sindhwani V, Fan Q, Avron H, Mahoney MW (2014) Random Laplace feature maps for semigroup kernels on histograms. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 971\u2013978","DOI":"10.1109\/CVPR.2014.129"},{"issue":"1","key":"2618_CR45","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1007\/s10994-015-5536-6","volume":"103","author":"T Yang","year":"2016","unstructured":"Yang T, Jin R, Zhu S, Lin Q (2016) On data preconditioning for regularized loss minimization. Mach Learn 103(1):57\u201379","journal-title":"Mach Learn"},{"key":"2618_CR46","unstructured":"Yang T, Li YF, Mahdavi M, Jin R, Zhou ZH (2012) Nystrom\u0308 method vs random Fourier features: A theoretical and empirical comparison. In: Advances in neural information processing systems, pp 476\u2013484"},{"key":"2618_CR47","doi-asserted-by":"crossref","unstructured":"Yedida R, Saha S, Prashanth T (2020) Lipschitzlr: Using theoretically computed adaptive learning rates for fast convergence. Appl Intell, pp 1\u201319","DOI":"10.1007\/s10489-020-01892-0"},{"key":"2618_CR48","first-page":"1264","volume":"89","author":"J Zhang","year":"2019","unstructured":"Zhang J, May A, Dao T, R\u00e9 C (2019) Low-precision random Fourier features for memory-constrained kernel approximation. Proc Mach Learn Res 89:1264","journal-title":"Proc Mach Learn Res"},{"key":"2618_CR49","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1016\/j.neucom.2020.06.092","volume":"413","author":"Z Zhang","year":"2020","unstructured":"Zhang Z, Zhou S, Li D, Yang T (2020) Gradient preconditioned mini-batch SGD for ridge regression. Neurocomputing 413:284\u2013293","journal-title":"Neurocomputing"},{"issue":"4","key":"2618_CR50","doi-asserted-by":"publisher","first-page":"783","DOI":"10.1109\/TNNLS.2015.2424684","volume":"27","author":"S Zhou","year":"2016","unstructured":"Zhou S (2016) Sparse LSSVM in primal using Cholesky factorization for large-scale problems. IEEE Trans Neural Netw Learn Syst 27(4):783\u2013795","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"issue":"1","key":"2618_CR51","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1198\/106186005X25619","volume":"14","author":"J Zhu","year":"2005","unstructured":"Zhu J, Hastie T (2005) Kernel logistic regression and the import vector machine. J Comput Graph Stat 14(1):185\u2013205","journal-title":"J Comput Graph Stat"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02618-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-021-02618-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02618-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,5]],"date-time":"2022-05-05T09:14:24Z","timestamp":1651742064000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-021-02618-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,20]]},"references-count":51,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2022,5]]}},"alternative-id":["2618"],"URL":"https:\/\/doi.org\/10.1007\/s10489-021-02618-6","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2021,9,20]]},"assertion":[{"value":"14 June 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}