{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T14:39:26Z","timestamp":1771511966419,"version":"3.50.1"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"1-2","license":[{"start":{"date-parts":[[2023,9,19]],"date-time":"2023-09-19T00:00:00Z","timestamp":1695081600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,19]],"date-time":"2023-09-19T00:00:00Z","timestamp":1695081600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CCF-1909298"],"award-info":[{"award-number":["CCF-1909298"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["DMS-2134037"],"award-info":[{"award-number":["DMS-2134037"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Math. Program."],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s10107-023-02017-4","type":"journal-article","created":{"date-parts":[[2023,9,19]],"date-time":"2023-09-19T14:04:01Z","timestamp":1695132241000},"page":"457-513","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Homotopic policy mirror descent: policy convergence, algorithmic regularization, and improved sample complexity"],"prefix":"10.1007","volume":"207","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7605-1670","authenticated-orcid":false,"given":"Yan","family":"Li","sequence":"first","affiliation":[]},{"given":"Guanghui","family":"Lan","sequence":"additional","affiliation":[]},{"given":"Tuo","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,9,19]]},"reference":[{"key":"2017_CR1","unstructured":"Agarwal, A., Kakade, S.M, Lee, J.D., Mahajan, G.: Optimality and approximation with policy gradient methods in Markov decision processes. In: Conference on Learning Theory, pp. 64\u201366. PMLR (2020)"},{"issue":"3","key":"2017_CR2","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1016\/S0167-6377(02)00231-6","volume":"31","author":"A Beck","year":"2003","unstructured":"Beck, A., Teboulle, M.: Mirror descent and nonlinear projected subgradient methods for convex optimization. Oper. Res. Lett. 31(3), 167\u2013175 (2003)","journal-title":"Oper. Res. Lett."},{"key":"2017_CR3","unstructured":"Ben-Tal, A., Nemirovski, A.: Optimization iii: convex analysis, nonlinear programming theory, nonlinear programming algorithms. In: Lecture notes, vol. 34 (2012)"},{"key":"2017_CR4","unstructured":"Bhandari, J., Russo, D.: A note on the linear convergence of policy gradient methods. arXiv preprint arXiv:2007.11120 (2020)"},{"key":"2017_CR5","doi-asserted-by":"publisher","first-page":"2563","DOI":"10.1287\/opre.2021.2151","volume":"70","author":"S Cen","year":"2021","unstructured":"Cen, S., Cheng, C., Chen, Y., Wei, Y., Chi, Y.: Fast global convergence of natural policy gradient methods with entropy regularization. Oper. Res. 70, 2563\u20132578 (2021)","journal-title":"Oper. Res."},{"key":"2017_CR6","first-page":"22274","volume":"34","author":"E Derman","year":"2021","unstructured":"Derman, E., Geist, M., Mannor, S.: Twice regularized MDPs and the equivalence between robustness and regularization. Adv. Neural Inf. Process. Syst. 34, 22274\u201322287 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"2","key":"2017_CR7","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1214\/07-AOAS131","volume":"1","author":"J Friedman","year":"2007","unstructured":"Friedman, J., Hastie, T., H\u00f6fling, H., Tibshirani, R.: Pathwise coordinate optimization. Ann. Appl. Stat. 1(2), 302\u2013332 (2007)","journal-title":"Ann. Appl. Stat."},{"key":"2017_CR8","unstructured":"Gunasekar, S., Lee, J., Soudry, D., Srebro, N.: Characterizing implicit bias in terms of optimization geometry. In: International Conference on Machine Learning, pp. 1832\u20131841. PMLR (2018)"},{"issue":"Oct","key":"2017_CR9","first-page":"1391","volume":"5","author":"T Hastie","year":"2004","unstructured":"Hastie, T., Rosset, S., Tibshirani, R., Zhu, J.: The entire regularization path for the support vector machine. J. Mach. Learn. Res. 5(Oct), 1391\u20131415 (2004)","journal-title":"J. Mach. Learn. Res."},{"key":"2017_CR10","unstructured":"Hu, Y., Ji, Z., Telgarsky, M.: Actor-critic is implicitly biased towards high entropy optimal policies. arXiv preprint arXiv:2110.11280 (2021)"},{"key":"2017_CR11","unstructured":"Ji, Z., Telgarsky, M.: The implicit bias of gradient descent on nonseparable data. In: Conference on Learning Theory, pp. 1772\u20131798. PMLR (2019)"},{"key":"2017_CR12","unstructured":"Kakade, S., Langford, J.: Approximately optimal approximate reinforcement learning. In: Proceedings of the 19th International Conference on Machine Learning. Citeseer (2002)"},{"key":"2017_CR13","unstructured":"Kakade, S.M.: A natural policy gradient. In: Advances in Neural Information Processing Systems, vol. 14 (2001)"},{"key":"2017_CR14","doi-asserted-by":"crossref","unstructured":"Khodadadian, S., Jhunjhunwala, P.R., Varma, S.M., Maguluri, S.T.: On the linear convergence of natural policy gradient algorithm. arXiv preprint arXiv:2105.01424 (2021)","DOI":"10.1109\/CDC45484.2021.9682908"},{"key":"2017_CR15","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-022-01816-5","author":"G Lan","year":"2022","unstructured":"Lan, G.: Policy mirror descent for reinforcement learning: linear convergence, new sampling complexity, and generalized problem classes. Math. Program. (2022). https:\/\/doi.org\/10.1007\/s10107-022-01816-5","journal-title":"Math. Program."},{"key":"2017_CR16","unstructured":"Lan, G., Li, Y., Zhao, T.: Block policy mirror descent. arXiv e-prints arXiv:2201.05756 (2022)"},{"key":"2017_CR17","unstructured":"Li, Y., Ju, C., Fang, E.X., Zhao, T.: Implicit regularization of Bregman proximal point algorithm and mirror descent on separable data. arXiv preprint arXiv:2108.06808 (2021)"},{"key":"2017_CR18","unstructured":"Li, Y., Fang, E.X., Xu, H., Zhao, T.: Implicit bias of gradient descent based adversarial training on separable data. In: International Conference on Learning Representations (2020)"},{"key":"2017_CR19","unstructured":"Lillicrap, T.P., Hunt, J.J., Pritzel, A., Heess, N., Erez, T., Tassa, Y., Silver, D., Wierstra, D.: Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971 (2015)"},{"key":"2017_CR20","unstructured":"Liu, B., Cai, Q., Yang, Z., Wang, Z.: Neural trust region\/proximal policy optimization attains globally optimal policy (2019)"},{"issue":"4","key":"2017_CR21","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1137\/070704277","volume":"19","author":"A Nemirovski","year":"2009","unstructured":"Nemirovski, A., Juditsky, A., Lan, G., Shapiro, A.: Robust stochastic approximation approach to stochastic programming. SIAM J. Optim. 19(4), 1574\u20131609 (2009)","journal-title":"SIAM J. Optim."},{"key":"2017_CR22","unstructured":"Nemirovskij, A.S., Yudin, D.B.: Problem complexity and method efficiency in optimization (1983)"},{"key":"2017_CR23","unstructured":"Neu, G., Jonsson, A., G\u00f3mez, V.: A unified view of entropy-regularized Markov decision processes. arXiv preprint arXiv:1705.07798 (2017)"},{"key":"2017_CR24","volume-title":"Numerical Optimization","author":"J Nocedal","year":"2006","unstructured":"Nocedal, J., Wright, S.: Numerical Optimization. Springer, Berlin (2006)"},{"issue":"3","key":"2017_CR25","doi-asserted-by":"publisher","first-page":"389","DOI":"10.1093\/imanum\/20.3.389","volume":"20","author":"MR Osborne","year":"2000","unstructured":"Osborne, M.R., Presnell, B., Turlach, B.A.: A new approach to variable selection in least squares problems. IMA J. Numer. Anal. 20(3), 389\u2013403 (2000)","journal-title":"IMA J. Numer. Anal."},{"issue":"4","key":"2017_CR26","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1111\/j.1467-9868.2007.00607.x","volume":"69","author":"MY Park","year":"2007","unstructured":"Park, M.Y., Hastie, T.: L1-regularization path algorithm for generalized linear models. J. R. Stat. Soc. Ser. B (Stat. Methodol.) 69(4), 659\u2013677 (2007)","journal-title":"J. R. Stat. Soc. Ser. B (Stat. Methodol.)"},{"key":"2017_CR27","doi-asserted-by":"crossref","unstructured":"Peters, J., Mulling, K., Altun, Y.: Relative entropy policy search. In: Twenty-Fourth AAAI Conference on Artificial Intelligence (2010)","DOI":"10.1609\/aaai.v24i1.7727"},{"key":"2017_CR28","volume-title":"Markov Decision Processes: Discrete Stochastic Dynamic Programming","author":"ML Puterman","year":"2005","unstructured":"Puterman, M.L.: Markov Decision Processes: Discrete Stochastic Dynamic Programming. Wiley, London (2005)"},{"key":"2017_CR29","first-page":"941","volume":"5","author":"S Rosset","year":"2004","unstructured":"Rosset, S., Zhu, J., Hastie, T.: Boosting as a regularized path to a maximum margin classifier. J. Mach. Learn. Res. 5, 941\u2013973 (2004)","journal-title":"J. Mach. Learn. Res."},{"key":"2017_CR30","unstructured":"Scherrer, B.: Improved and generalized upper bounds on the complexity of policy iteration. In: Advances in Neural Information Processing Systems, vol. 26 (2013)"},{"key":"2017_CR31","unstructured":"Schulman, J., Levine, S., Abbeel, P., Jordan, M., Moritz, P.: Trust region policy optimization. In: International conference on machine learning, pp. 1889\u20131897. PMLR (2015)"},{"key":"2017_CR32","doi-asserted-by":"crossref","unstructured":"Shani, L., Efroni, Y., Mannor, S.: Adaptive trust region policy optimization: global convergence and faster rates for regularized MDPs. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 5668\u20135675 (2020)","DOI":"10.1609\/aaai.v34i04.6021"},{"issue":"1","key":"2017_CR33","first-page":"2822","volume":"19","author":"D Soudry","year":"2018","unstructured":"Soudry, D., Hoffer, E., Nacson, M.S., Gunasekar, S., Srebro, N.: The implicit bias of gradient descent on separable data. J. Mach. Learn. Res. 19(1), 2822\u20132878 (2018)","journal-title":"J. Mach. Learn. Res."},{"key":"2017_CR34","unstructured":"Xiao, L.: On the convergence rates of policy gradient methods. arXiv preprint arXiv:2201.07443 (2022)"},{"issue":"2","key":"2017_CR35","doi-asserted-by":"publisher","first-page":"1062","DOI":"10.1137\/120869997","volume":"23","author":"L Xiao","year":"2013","unstructured":"Xiao, L., Zhang, T.: A proximal-gradient homotopy method for the sparse least-squares problem. SIAM J. Optim. 23(2), 1062\u20131091 (2013)","journal-title":"SIAM J. Optim."},{"issue":"4","key":"2017_CR36","doi-asserted-by":"publisher","first-page":"593","DOI":"10.1287\/moor.1110.0516","volume":"36","author":"Y Ye","year":"2011","unstructured":"Ye, Y.: The simplex and policy-iteration methods are strongly polynomial for the Markov decision problem with a fixed discount rate. Math. Oper. Res. 36(4), 593\u2013603 (2011)","journal-title":"Math. Oper. Res."},{"key":"2017_CR37","unstructured":"Zhan, W., Cen, S., Huang, B., Chen, Y., Lee, J.D., Chi, Y.: Policy mirror descent for regularized reinforcement learning: A generalized framework with linear convergence. arXiv preprint arXiv:2105.11066 (2021)"},{"key":"2017_CR38","first-page":"2701","volume":"8","author":"P Zhao","year":"2007","unstructured":"Zhao, P., Bin, Yu.: Stagewise lasso. J. Mach. Learn. Res. 8, 2701\u20132726 (2007)","journal-title":"J. Mach. Learn. Res."}],"container-title":["Mathematical Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10107-023-02017-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10107-023-02017-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10107-023-02017-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,7]],"date-time":"2024-08-07T15:06:03Z","timestamp":1723043163000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10107-023-02017-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,19]]},"references-count":38,"journal-issue":{"issue":"1-2","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["2017"],"URL":"https:\/\/doi.org\/10.1007\/s10107-023-02017-4","relation":{},"ISSN":["0025-5610","1436-4646"],"issn-type":[{"value":"0025-5610","type":"print"},{"value":"1436-4646","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,9,19]]},"assertion":[{"value":"15 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 July 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 September 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We acknowledge the submission policy and declare no conflict of interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}