{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T03:21:42Z","timestamp":1775618502266,"version":"3.50.1"},"reference-count":26,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2022,4,28]],"date-time":"2022-04-28T00:00:00Z","timestamp":1651104000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,4,28]],"date-time":"2022-04-28T00:00:00Z","timestamp":1651104000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1953199"],"award-info":[{"award-number":["1953199"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100005825","name":"National Institute of Food and Agriculture","doi-asserted-by":"publisher","award":["2020-67021-31526"],"award-info":[{"award-number":["2020-67021-31526"]}],"id":[{"id":"10.13039\/100005825","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1909298"],"award-info":[{"award-number":["1909298"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Math. Program."],"published-print":{"date-parts":[[2023,3]]},"DOI":"10.1007\/s10107-022-01816-5","type":"journal-article","created":{"date-parts":[[2022,4,28]],"date-time":"2022-04-28T14:03:50Z","timestamp":1651154630000},"page":"1059-1106","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":39,"title":["Policy mirror descent for reinforcement learning: linear convergence, new sampling complexity, and generalized problem classes"],"prefix":"10.1007","volume":"198","author":[{"given":"Guanghui","family":"Lan","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,4,28]]},"reference":[{"key":"1816_CR1","unstructured":"Agarwal, A., Kakade, S.M., Lee, J.D., Mahajan, G.: On the theory of policy gradient methods: optimality, approximation, and distribution shift. arXiv:1908.00261 (2019)"},{"key":"1816_CR2","first-page":"927","volume":"27","author":"A Beck","year":"2003","unstructured":"Beck, A., Teboulle, M.: Mirror descent and nonlinear projected subgradient methods for convex optimization. SIAM J. Optim. 27, 927\u2013956 (2003)","journal-title":"SIAM J. Optim."},{"issue":"68","key":"1816_CR3","doi-asserted-by":"publisher","first-page":"247","DOI":"10.2307\/2002797","volume":"13","author":"R Bellman","year":"1959","unstructured":"Bellman, R., Dreyfus, S.: Functional approximations and dynamic programming. Math. Tables Other Aids Comput. 13(68), 247\u2013251 (1959)","journal-title":"Math. Tables Other Aids Comput."},{"key":"1816_CR4","unstructured":"Bhandari, J., Russo, D.: A Note on the Linear Convergence of Policy Gradient Methods. arXiv e-prints arXiv:2007.11120 (2020)"},{"key":"1816_CR5","unstructured":"Cen, S., Cheng, C., Chen, Y., Wei, Y., Chi, Y.: Fast Global Convergence of Natural Policy Gradient Methods with Entropy Regularization. 
arXiv e-prints arXiv:2007.06558 (2020)"},{"issue":"2","key":"1816_CR6","doi-asserted-by":"publisher","first-page":"277","DOI":"10.1007\/s10589-014-9673-9","volume":"60","author":"CD Dang","year":"2015","unstructured":"Dang, C.D., Lan, G.: On the convergence properties of non-Euclidean extragradient methods for variational inequalities with generalized monotone operators. Comput. Optim. Appl. 60(2), 277\u2013310 (2015)","journal-title":"Comput. Optim. Appl."},{"issue":"3","key":"1816_CR7","doi-asserted-by":"publisher","first-page":"726","DOI":"10.1287\/moor.1090.0396","volume":"34","author":"E Even-Dar","year":"2009","unstructured":"Even-Dar, E., Kakade, S.M., Mansour, Y.: Online Markov decision processes. Math. Oper. Res. 34(3), 726\u2013736 (2009)","journal-title":"Math. Oper. Res."},{"key":"1816_CR8","volume-title":"Finite-Dimensional Variational Inequalities and Complementarity Problems. Volumes I and II. Comprehensive Study in Mathematics","author":"F Facchinei","year":"2003","unstructured":"Facchinei, F., Pang, J.: Finite-Dimensional Variational Inequalities and Complementarity Problems. Volumes I and II. Comprehensive Study in Mathematics. Springer, New York (2003)"},{"key":"1816_CR9","unstructured":"Kakade, S., Langford, J.: Approximately optimal approximate reinforcement learning. In: Proceedings of the International Conference on Machine Learning (ICML) (2002)"},{"key":"1816_CR10","doi-asserted-by":"crossref","unstructured":"Khodadadian, S., Chen, Z., Maguluri, S.T.: Finite-sample analysis of off-policy natural actor-critic algorithm. arXiv:2102.09318 (2021)","DOI":"10.1109\/TAC.2022.3190032"},{"key":"1816_CR11","unstructured":"Kotsalis, G., Lan, G., Li, T.: Simple and optimal methods for stochastic variational inequalities, I: operator extrapolation. arXiv:2011.02987 (2020)"},{"key":"1816_CR12","unstructured":"Kotsalis, G., Lan, G., Li, T.: Simple and optimal methods for stochastic variational inequalities, II: Markovian noise and policy evaluation in reinforcement learning. arXiv:2011.08434 (2020)"},{"key":"1816_CR13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-39568-1","volume-title":"First-Order and Stochastic Optimization Methods for Machine Learning","author":"G Lan","year":"2020","unstructured":"Lan, G.: First-Order and Stochastic Optimization Methods for Machine Learning. Springer, Switzerland (2020)"},{"key":"1816_CR14","unstructured":"Liu, B., Cai, Q., Yang, Z., Wang, Z.: Neural proximal\/trust region policy optimization attains globally optimal policy. arXiv:1906.10306 (2019)"},{"key":"1816_CR15","unstructured":"Mei, J., Xiao, C., Szepesvari, C., Schuurmans, D.: On the Global Convergence Rates of Softmax Policy Gradient Methods. arXiv:2005.06392 (2020)"},{"key":"1816_CR16","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1137\/070704277","volume":"19","author":"AS Nemirovski","year":"2009","unstructured":"Nemirovski, A.S., Juditsky, A., Lan, G., Shapiro, A.: Robust stochastic approximation approach to stochastic programming. SIAM J. Optim. 19, 1574\u20131609 (2009)","journal-title":"SIAM J. Optim."},{"key":"1816_CR17","volume-title":"Problem Complexity and Method Efficiency in Optimization. Wiley-Interscience Series in Discrete Mathematics","author":"AS Nemirovski","year":"1983","unstructured":"Nemirovski, A.S., Yudin, D.: Problem Complexity and Method Efficiency in Optimization. Wiley-Interscience Series in Discrete Mathematics. 
Wiley, New York (1983)"},{"key":"1816_CR18","first-page":"543","volume":"269","author":"YE Nesterov","year":"1983","unstructured":"Nesterov, Y.E.: A method for unconstrained convex minimization problem with the rate of convergence $$O(1\/k^2)$$. Dokl. AN SSSR 269, 543\u2013547 (1983)","journal-title":"Dokl. AN SSSR"},{"key":"1816_CR19","doi-asserted-by":"publisher","DOI":"10.1002\/9780470316887","volume-title":"Markov Decision Processes: Discrete Stochastic Dynamic Programming","author":"Martin L Puterman","year":"1994","unstructured":"Puterman, Martin L.: Markov Decision Processes: Discrete Stochastic Dynamic Programming, 1st edn. Wiley, New York (1994)","edition":"1"},{"key":"1816_CR20","doi-asserted-by":"crossref","unstructured":"Shani, L., Efroni, Y., Mannor, S.: Adaptive trust region policy optimization: global convergence and faster rates for regularized MDPS. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020, pp. 5668\u20135675. AAAI Press (2020)","DOI":"10.1609\/aaai.v34i04.6021"},{"key":"1816_CR21","unstructured":"Sutton, R.S., McAllester, D., Singh, S., Mansour, Y.: Policy gradient methods for reinforcement learning with function approximation. In: NIPS\u201999: Proceedings of the 12th International Conference on Neural Information Processing Systems, pp. 1057\u20131063 (1999)"},{"key":"1816_CR22","unstructured":"Tomar, M., Shani, L., Efroni, Y., Ghavamzadeh, M.: Mirror descent policy optimization. arXiv:2005.09814 (2020)"},{"key":"1816_CR23","volume-title":"High-Dimensional Probability: An Introduction with Applications in Data Science","author":"R Vershynin","year":"2018","unstructured":"Vershynin, R.: High-Dimensional Probability: An Introduction with Applications in Data Science, vol. 47. Cambridge University Press, Cambridge (2018)"},{"key":"1816_CR24","unstructured":"Wang, L., Cai, Q., Yang, Z., Wang, Z.: Neural policy gradient methods: global optimality and rates of convergence. arXiv:abs\/1909.01150 (2020)"},{"key":"1816_CR25","doi-asserted-by":"crossref","unstructured":"Wolfer, G., Kontorovich, A.: Statistical estimation of ergodic Markov chain kernel over discrete state space. arXiv:1809.05014v6 (2020)","DOI":"10.3150\/20-BEJ1248"},{"key":"1816_CR26","unstructured":"Xu, T., Wang, Z., Liang, Y.: Improving sample complexity bounds for actor-critic algorithms. 
arXiv:2004.12956 (2020)"}],"container-title":["Mathematical Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10107-022-01816-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10107-022-01816-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10107-022-01816-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T22:23:05Z","timestamp":1677018185000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10107-022-01816-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4,28]]},"references-count":26,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,3]]}},"alternative-id":["1816"],"URL":"https:\/\/doi.org\/10.1007\/s10107-022-01816-5","relation":{},"ISSN":["0025-5610","1436-4646"],"issn-type":[{"value":"0025-5610","type":"print"},{"value":"1436-4646","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,4,28]]},"assertion":[{"value":"5 February 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 April 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 April 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}
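
The record above follows the Crossref REST API "work" message format (status/message-type envelope with the bibliographic payload under "message"). As a minimal sketch of how such a record can be fetched and summarized: the endpoint pattern https://api.crossref.org/works/{doi} is the public Crossref API, but the script name, contact e-mail, and field-access defaults below are illustrative assumptions, and the snippet assumes network access plus the third-party `requests` package.

```python
# Minimal sketch: fetch and summarize a Crossref work record like the one above.
# The User-Agent contact address is a placeholder, not taken from the record.
import requests

DOI = "10.1007/s10107-022-01816-5"

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-script/0.1 (mailto:you@example.org)"},
    timeout=30,
)
resp.raise_for_status()
msg = resp.json()["message"]  # the work payload sits under "message"

# Basic bibliographic fields; Crossref lists may be empty, so index defensively.
title = (msg.get("title") or [""])[0]
journal = (msg.get("container-title") or [""])[0]
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in msg.get("author", [])
)
year = msg["published"]["date-parts"][0][0]

print(f"{authors} ({year}). {title}. {journal} {msg.get('volume', '')}, {msg.get('page', '')}.")

# The "reference" array mirrors the 26 entries embedded in the record above;
# count how many carry their own DOI versus only an unstructured citation string.
refs = msg.get("reference", [])
with_doi = sum(1 for r in refs if "DOI" in r)
print(f"{len(refs)} references, {with_doi} with a DOI, {len(refs) - with_doi} unstructured only.")
```

For this particular DOI the summary line resolves to Lan (2022), "Policy mirror descent for reinforcement learning...", Mathematical Programming 198, 1059-1106, matching the title, volume, page, and author fields present in the record.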