{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T06:44:15Z","timestamp":1781851455400,"version":"3.54.5"},"publisher-location":"Berlin, Heidelberg","reference-count":49,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783662635186","type":"print"},{"value":"9783662635193","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-662-63519-3_5","type":"book-chapter","created":{"date-parts":[[2021,5,17]],"date-time":"2021-05-17T11:03:01Z","timestamp":1621249381000},"page":"105-130","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":36,"title":["Convergence Proof for Actor-Critic Methods Applied to PPO and RUDDER"],"prefix":"10.1007","author":[{"given":"Markus","family":"Holzleitner","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lukas","family":"Gruber","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jos\u00e9","family":"Arjona-Medina","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Johannes","family":"Brandstetter","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sepp","family":"Hochreiter","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2021,5,18]]},"reference":[{"issue":"7","key":"5_CR1","doi-asserted-by":"publisher","first-page":"573","DOI":"10.1016\/j.sysconle.2006.01.002","volume":"55","author":"PA Absil","year":"2006","unstructured":"Absil, P.A., Kurdyka, K.: On the stable equilibrium points of gradient systems. Syst. Control Lett. 55(7), 573\u2013577 (2006)","journal-title":"Syst. Control Lett."},{"key":"5_CR2","unstructured":"Arjona-Medina, J.A., Gillhofer, M., Widrich, M., Unterthiner, T., Brandstetter, J., Hochreiter, S.: RUDDER: Return decomposition for delayed rewards (2018). ArXiv https:\/\/arxiv.org\/abs\/1806.07857"},{"key":"5_CR3","unstructured":"Arjona-Medina, J.A., Gillhofer, M., Widrich, M., Unterthiner, T., Brandstetter, J., Hochreiter, S.: RUDDER: return decomposition for delayed rewards. In: Advances in Neural Information Processing Systems, vol. 33 (2019). ArXiv https:\/\/arxiv.org\/abs\/1806.07857"},{"key":"5_CR4","doi-asserted-by":"publisher","unstructured":"Bakker, B.: Reinforcement learning by backpropagation through an LSTM model\/critic. In: IEEE International Symposium on Approximate Dynamic Programming and Reinforcement Learning, pp. 127\u2013134 (2007). https:\/\/doi.org\/10.1109\/ADPRL.2007.368179","DOI":"10.1109\/ADPRL.2007.368179"},{"key":"5_CR5","volume-title":"Neuro-Dynamic Programming","author":"DP Bertsekas","year":"1996","unstructured":"Bertsekas, D.P., Tsitsiklis, J.N.: Neuro-Dynamic Programming. Athena Scientific, Belmont (1996)"},{"key":"5_CR6","series-title":"Lecture Notes in Control and Information Sciences","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/978-1-4471-4285-0","volume-title":"Stochastic Recursive Algorithms for Optimization","author":"S Bhatnagar","year":"2013","unstructured":"Bhatnagar, S., Prasad, H.L., Prashanth, L.A.: Stochastic Recursive Algorithms for Optimization. Lecture Notes in Control and Information Sciences, 1st edn., p. 302. Springer, London (2013). https:\/\/doi.org\/10.1007\/978-1-4471-4285-0","edition":"1"},{"key":"5_CR7","doi-asserted-by":"publisher","unstructured":"Stochastic Approximation. TRM, vol. 48. Hindustan Book Agency, Gurgaon (2008). https:\/\/doi.org\/10.1007\/978-93-86279-38-5","DOI":"10.1007\/978-93-86279-38-5"},{"key":"5_CR8","doi-asserted-by":"publisher","unstructured":"Borkar, V.S., Meyn, S.P.: The O.D.E. method for convergence of stochastic approximation and reinforcement learning. SIAM J. Control Optim. 38(2), 447\u2013469 (2000). https:\/\/doi.org\/10.1137\/S0363012997331639","DOI":"10.1137\/S0363012997331639"},{"key":"5_CR9","unstructured":"Casella, G., Berger, R.L.: Statistical Inference. Wadsworth and Brooks\/Cole, Stanley (2002)"},{"key":"5_CR10","unstructured":"Choromanska, A., Henaff, M., Mathieu, M., Arous, G.B., LeCun, Y.: The loss surfaces of multilayer networks. In: Proceedings of the Eighteenth International Conference on Artificial Intelligence and Statistics, pp. 192\u2013204 (2015)"},{"key":"5_CR11","first-page":"341","volume":"8","author":"P Dayan","year":"1992","unstructured":"Dayan, P.: The convergence of TD($$\\lambda $$) for general $$\\lambda $$. Mach. Learn. 8, 341 (1992)","journal-title":"Mach. Learn."},{"key":"5_CR12","unstructured":"Fan, J., Wang, Z., Xie, Y., Yang, Z.: A theoretical analysis of deep $$q$$-learning. CoRR abs\/1901.00137 (2020)"},{"key":"5_CR13","unstructured":"Hairer, M.: Ergodic properties of Markov processes. In: Lecture Notes (2018)"},{"key":"5_CR14","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Klambauer, G., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a Nash equilibrium. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems, vol. 30. pp. 6626\u20136637. Curran Associates, Inc. (2017). Preprint arXiv:1706.08500"},{"key":"5_CR15","unstructured":"Jin, C., Netrapalli, P., Jordan, M.I.: Minmax optimization: Stable limit points of gradient descent ascent are locally optimal. arXiv:1902.00618 (2019)"},{"key":"5_CR16","doi-asserted-by":"publisher","DOI":"10.1287\/moor.2017.0855","author":"P Karmakar","year":"2017","unstructured":"Karmakar, P., Bhatnagar, S.: Two time-scale stochastic approximation with controlled Markov noise and off-policy temporal-difference learning. Math. Oper. Res. (2017). https:\/\/doi.org\/10.1287\/moor.2017.0855","journal-title":"Math. Oper. Res."},{"key":"5_CR17","unstructured":"Kawaguchi, K.: Deep learning without poor local minima. In: Lee, D.D., Sugiyama, M., Luxburg, U.V., Guyon, I., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 29. pp. 586\u2013594 (2016)"},{"key":"5_CR18","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1016\/j.neunet.2019.06.009","volume":"118","author":"K Kawaguchi","year":"2019","unstructured":"Kawaguchi, K., Bengio, Y.: Depth with nonlinearity creates no bad local minima in ResNets. Neural Netw. 118, 167\u2013174 (2019)","journal-title":"Neural Netw."},{"issue":"6","key":"5_CR19","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1162\/neco_a_01195","volume":"31","author":"K Kawaguchi","year":"2019","unstructured":"Kawaguchi, K., Huang, J., Kaelbling, L.P.: Effect of depth and width on local minima in deep learning. Neural Comput. 31(6), 1462\u20131498 (2019)","journal-title":"Neural Comput."},{"key":"5_CR20","unstructured":"Kawaguchi, K., Kaelbling, L.P., Bengio, Y.: Generalization in deep learning. arXiv:1710.05468 (2017)"},{"issue":"1","key":"5_CR21","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1137\/S036301299731669X","volume":"38","author":"VR Konda","year":"1999","unstructured":"Konda, V.R., Borkar, V.S.: Actor-critic-type learning algorithms for Markov decision processes. SIAM J. Control Optim. 38(1), 94\u2013123 (1999). https:\/\/doi.org\/10.1137\/S036301299731669X","journal-title":"SIAM J. Control Optim."},{"key":"5_CR22","unstructured":"Konda, V.R., Tsitsiklis, J.N.: Actor-critic algorithms. In: Advances in Neural Information Processing Systems, pp. 1008\u20131014 (2000)"},{"issue":"4","key":"5_CR23","doi-asserted-by":"publisher","first-page":"1143","DOI":"10.1137\/S0363012901385691","volume":"42","author":"VR Konda","year":"2003","unstructured":"Konda, V.R., Tsitsiklis, J.N.: On actor-critic algorithms. SIAM J. Control Optim. 42(4), 1143\u20131166 (2003). https:\/\/doi.org\/10.1137\/S0363012901385691","journal-title":"SIAM J. Control Optim."},{"key":"5_CR24","doi-asserted-by":"publisher","unstructured":"Kushner, H.J., Clark, D.S.: Stochastic Approximation Methods for Constrained and Unconstrained Systems. Applied Mathematical Sciences. Springer, New York (1978). https:\/\/doi.org\/10.1007\/978-1-4684-9352-8","DOI":"10.1007\/978-1-4684-9352-8"},{"key":"5_CR25","doi-asserted-by":"publisher","unstructured":"Kushner, H.J., Yin, G.G.: Stochastic Approximation and Recursive Algorithms and Applications. Stochastic Modelling and Applied Probability. Springer, New York (2003). https:\/\/doi.org\/10.1007\/b97441","DOI":"10.1007\/b97441"},{"key":"5_CR26","unstructured":"Lin, T., Jin, C., Jordan, M.I.: On gradient descent ascent for nonconvex-concave minimax problems. arXiv:1906.00331 (2019)"},{"key":"5_CR27","unstructured":"Liu, B., Cai, Q., Yang, Z., Wang, Z.: Neural proximal\/trust region policy optimization attains globally optimal policy. In: Advances in Neural Information Processing Systems, vol. 33. arXiv:1906.10306 (2019)"},{"key":"5_CR28","unstructured":"Maei, H.R., Szepesv\u00e1ri, C., Bhatnagar, S., Precup, D., Silver, D., Sutton, R.S.: Convergent temporal-difference learning with arbitrary smooth function approximation. In: Bengio, Y., Schuurmans, D., Lafferty, J.D., Williams, C.K.I., Culotta, A. (eds.) Advances in Neural Information Processing Systems, vol. 22. pp. 1204\u20131212. Curran Associates, Inc. (2009)"},{"key":"5_CR29","unstructured":"Mazumdar, E.V., Jordan, M.I., Sastry, S.S.: On finding local Nash equilibria (and only local Nash equilibria) in zero-sum games. arXiv:1901.00838 (2019)"},{"key":"5_CR30","unstructured":"Metrikopoulos, P., Hallak, N., Kavis, A., Cevher, V.: On the almost sure convergence of stochastic gradient descent in non-convex problems. In: Advances in Neural Information Processing Systems, vol. 34 (2020). arXiv:2006.11144"},{"key":"5_CR31","unstructured":"Mnih, V., et al.: Playing atari with deep reinforcement learning. arXiv:1312.5602 (2013)"},{"issue":"7540","key":"5_CR32","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih, V., et al.: Human-level control through deep reinforcement learning. Nature 518(7540), 529\u2013533 (2015). https:\/\/doi.org\/10.1038\/nature14236","journal-title":"Nature"},{"key":"5_CR33","unstructured":"Munro, P.W.: A dual back-propagation scheme for scalar reinforcement learning. In: Proceedings of the Ninth Annual Conference of the Cognitive Science Society, Seattle, WA, pp. 165\u2013176 (1987)"},{"key":"5_CR34","unstructured":"Open, A.I., et al.: Dota 2 with large scale deep reinforcement learning. arXiv:1912.06680 (2019)"},{"key":"5_CR35","unstructured":"Patil, V.P., et al.: Align-RUDDER: learning from few demonstrations by reward redistribution. arXiv:2009.14108 (2020)"},{"key":"5_CR36","volume-title":"Markov Decision Processes","author":"ML Puterman","year":"2005","unstructured":"Puterman, M.L.: Markov Decision Processes, 2nd edn. Wiley, Hoboken (2005)","edition":"2"},{"issue":"3","key":"5_CR37","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Ann. Math. Stat. 22(3), 400\u2013407 (1951). https:\/\/doi.org\/10.1214\/aoms\/1177729586","journal-title":"Ann. Math. Stat."},{"key":"5_CR38","unstructured":"Robinson, A.J.: Dynamic error propagation networks. Ph.D. thesis, Trinity Hall and Cambridge University Engineering Department (1989)"},{"key":"5_CR39","unstructured":"Robinson, T., Fallside, F.: Dynamic reinforcement driven error propagation networks with application to game playing. In: Proceedings of the 11th Conference of the Cognitive Science Society, Ann Arbor, pp. 836\u2013843 (1989)"},{"key":"5_CR40","unstructured":"Schulman, J., Levine, S., Moritz, P., Jordan, M.I., Abbeel, P.: Trust region policy optimization. arXiv:1502.05477 (2015). 31st International Conference on Machine Learning (ICML), Proceedings of Machine Learning Research, vol. 37"},{"key":"5_CR41","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv:1707.06347 (2018)"},{"key":"5_CR42","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1023\/A:1007678930559","volume":"38","author":"S Singh","year":"2000","unstructured":"Singh, S., Jaakkola, T., Littman, M., Szepesv\u00e1ri, C.: Convergence results for single-step on-policy reinforcement-learning algorithms. Mach. Learn. 38, 287\u2013308 (2000). https:\/\/doi.org\/10.1023\/A:1007678930559","journal-title":"Mach. Learn."},{"key":"5_CR43","volume-title":"Reinforcement Learning: An Introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: An Introduction, 2nd edn. MIT Press, Cambridge (2018)","edition":"2"},{"key":"5_CR44","unstructured":"Sutton, R.S., McAllester, D., Singh, S., Mansour, Y.: Policy gradient methods for reinforcement learning with function approximation. In: Advances in Neural Information Processing Systems, pp. 1057\u20131063 (2000)"},{"issue":"3","key":"5_CR45","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1023\/A:1022689125041","volume":"16","author":"JN Tsitsiklis","year":"1994","unstructured":"Tsitsiklis, J.N.: Asynchronous stochastic approximation and $$q$$-learning. Mach. Learn. 16(3), 185\u2013202 (1994). https:\/\/doi.org\/10.1023\/A:1022689125041","journal-title":"Mach. Learn."},{"issue":"7782","key":"5_CR46","doi-asserted-by":"publisher","first-page":"350","DOI":"10.1038\/s41586-019-1724-z","volume":"575","author":"O Vinyals","year":"2019","unstructured":"Vinyals, O., et al.: Grandmaster level in StarCraft II using multi-agent reinforcement learning. Nature 575(7782), 350\u2013354 (2019). https:\/\/doi.org\/10.1038\/s41586-019-1724-z","journal-title":"Nature"},{"key":"5_CR47","first-page":"279","volume":"8","author":"CJCH Watkins","year":"1992","unstructured":"Watkins, C.J.C.H., Dayan, P.: Q-learning. Mach. Learn. 8, 279\u2013292 (1992)","journal-title":"Mach. Learn."},{"key":"5_CR48","first-page":"10633","volume":"32","author":"T Xu","year":"2019","unstructured":"Xu, T., Zou, S., Liang, Y.: Two time-scale off-policy TD learning: non-asymptotic analysis over Markovian samples. Adv. Neural Inf. Process. Syst. 32, 10633\u201310643 (2019)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"5_CR49","first-page":"8351","volume":"32","author":"Z Yang","year":"2019","unstructured":"Yang, Z., Chen, Y., Hong, M., Wang, Z.: Provably global convergence of actor-critic: a case for linear quadratic regulator with ergodic cost. Adv. Neural Inf. Process. Syst. 32, 8351\u20138363 (2019)","journal-title":"Adv. Neural Inf. Process. Syst."}],"container-title":["Lecture Notes in Computer Science","Transactions on Large-Scale Data- and Knowledge-Centered Systems XLVIII"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-662-63519-3_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,7,14]],"date-time":"2021-07-14T06:04:39Z","timestamp":1626242679000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-662-63519-3_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783662635186","9783662635193"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-662-63519-3_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"18 May 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}