{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T11:41:03Z","timestamp":1775562063581,"version":"3.50.1"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"8-9","license":[{"start":{"date-parts":[[2019,2,21]],"date-time":"2019-02-21T00:00:00Z","timestamp":1550707200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2019,9]]},"DOI":"10.1007\/s10994-019-05788-0","type":"journal-article","created":{"date-parts":[[2019,2,21]],"date-time":"2019-02-21T18:26:00Z","timestamp":1550773560000},"page":"1467-1501","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":30,"title":["TD-regularized actor-critic methods"],"prefix":"10.1007","volume":"108","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3886-8131","authenticated-orcid":false,"given":"Simone","family":"Parisi","sequence":"first","affiliation":[]},{"given":"Voot","family":"Tangkaratt","sequence":"additional","affiliation":[]},{"given":"Jan","family":"Peters","sequence":"additional","affiliation":[]},{"given":"Mohammad Emtiyaz","family":"Khan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,2,21]]},"reference":[{"key":"5788_CR1","unstructured":"Achiam, J., Held, D., Tamar, A., & Abbeel, P. (2017). Constrained policy optimization. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5788_CR2","unstructured":"Akrour, R., Abdolmaleki, A., Abdulsamad, H., & Neumann, G. (2016). Model-Free trajectory optimization for reinforcement learning. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5788_CR3","doi-asserted-by":"crossref","unstructured":"Baird, L. (1993). Advantage updating. Tech. rep., Wright-Patterson Air Force Base Ohio: Wright Laboratory.","DOI":"10.21236\/ADA280862"},{"key":"5788_CR4","doi-asserted-by":"crossref","unstructured":"Baird, L. (1995). Residual algorithms: Reinforcement learning with function approximation. In Proceedings of the international conference on machine learning (ICML).","DOI":"10.1016\/B978-1-55860-377-6.50013-X"},{"key":"5788_CR5","unstructured":"Belousov, B., & Peters, J. (2017). f-Divergence constrained policy improvement. arXiv:1801.00056."},{"key":"5788_CR6","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511804441","volume-title":"Convex optimization","author":"S Boyd","year":"2004","unstructured":"Boyd, S., & Vandenberghe, L. (2004). Convex optimization. New York, NY: Cambridge University Press."},{"key":"5788_CR7","unstructured":"Brockman, G., Cheung, V., Pettersson, L., Schneider, J., Schulman, J., Tang, J., & Zaremba, W. (2016). OpenAI gym. arXiv:1606.01540."},{"key":"5788_CR8","unstructured":"Castro, D.D., Volkinshtein, D., & Meir, R. (2008). Temporal difference based actor critic learning\u2014Convergence and neural implementation. In Advances in neural information processing systems (NIPS)."},{"key":"5788_CR9","unstructured":"Dai, B., Shaw, A., He, N., Li, L., & Song, L. (2018). Boosting the actor with dual critic. 
In Proceedings of the international conference on learning representations (ICLR)."},{"issue":"1\u20132","key":"5788_CR10","first-page":"1","volume":"2","author":"MP Deisenroth","year":"2013","unstructured":"Deisenroth, M. P., Neumann, G., & Peters, J. (2013). A survey on policy search for robotics. Foundations and Trends in Robotics, 2(1\u20132), 1\u2013142.","journal-title":"Foundations and Trends in Robotics"},{"key":"5788_CR11","unstructured":"Fujimoto, S., van Hoof, H., & Meger, D. (2018). Addressing function approximation error in Actor-Critic methods. In Proceedings of the international conference on machine learning (ICML)."},{"issue":"(Nov)","key":"5788_CR12","first-page":"1471","volume":"5","author":"E Greensmith","year":"2004","unstructured":"Greensmith, E., Bartlett, P. L., & Baxter, J. (2004). Variance reduction techniques for gradient estimates in reinforcement learning. Journal of Machine Learning Research (JMLR), 5((Nov)), 1471\u20131530.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"5788_CR13","unstructured":"Gruslys, A., Azar, M. G., Bellemare, M. G., & Munos, R. (2018). The reactor: A fast and sample-efficient Actor-Critic agent for reinforcement learning. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR14","unstructured":"Gu, S., Levine, S., Sutskever, I., & Mnih, A. (2016a). Muprop: Unbiased backpropagation for stochastic neural networks. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR15","unstructured":"Gu, S., Lillicrap, T., Sutskever, I., & Levine, S. (2016b). Continuous deep Q-Learning with Model-based acceleration. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5788_CR16","unstructured":"Haarnoja, T., Zhou, A., Abbeel, P., & Levine, S. (2018). Soft Actor-Critic: Off-Policy maximum entropy deep reinforcement learning with a stochastic actor. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5788_CR17","unstructured":"Henderson, P., Islam, R., Bachman, P., Pineau, J., Precup, D., & Meger, D. (2017). Deep reinforcement learning that matters. In Proceedings of the conference on artificial intelligence (AAAI)."},{"key":"5788_CR18","doi-asserted-by":"crossref","unstructured":"Hessel, M., Modayil, J., van Hasselt, H., Schaul, T., Ostrovski, G., Dabney, W., Horgan, D., & Silver, D. (2018). Rainbow: Combining improvements in deep reinforcement learning. In Proceedings of the conference on artificial intelligence (AAAI).","DOI":"10.1609\/aaai.v32i1.11796"},{"key":"5788_CR19","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR20","unstructured":"Konda, V. R., & Tsitsiklis, J. N. (2000). Actor-critic algorithms. In Advances in neural information processing systems (NIPS)."},{"key":"5788_CR21","unstructured":"Lillicrap, T. P., Hunt, J. J., Pritzel, A., Heess, N., Erez, T., Tassa, Y., Silver, D., & Wierstra, D. (2016). Continuous control with deep reinforcement learning. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR22","unstructured":"Mnih, V., Badia, A. P., Mirza, M., Graves, A., Lillicrap, T., Harley, T., Silver, D., & Kavukcuoglu, K. (2016). Asynchronous methods for deep reinforcement learning. 
In International conference on machine learning (ICML)."},{"issue":"7540","key":"5788_CR23","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih, V., Kavukcuoglu, K., Silver, D., Rusu, A. A., Veness, J., Bellemare, M. G., et al. (2015). Human-level control through deep reinforcement learning. Nature, 518(7540), 529\u2013533.","journal-title":"Nature"},{"key":"5788_CR24","unstructured":"Munos, R., Stepleton, T., Harutyunyan, A., & Bellemare, M. G. (2016). Safe and efficient Off-Policy reinforcement learning. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR25","unstructured":"Nachum, O., Norouzi, M., Xu, K., & Schuurmans, D. (2018). Trust-PCL: An Off-Policy trust region method for continuous control. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR26","unstructured":"Nocedal, J., & Wright, S. (2006). Numerical optimization (2nd ed.). Springer, New York, NY: Springer Series in Operations Research and Financial Engineering."},{"key":"5788_CR27","doi-asserted-by":"crossref","unstructured":"Peters, J., Muelling, K., & Altun, Y. (2010). Relative entropy policy search. In Proceedings of the conference on artificial intelligence (AAAI).","DOI":"10.1609\/aaai.v24i1.7727"},{"issue":"7","key":"5788_CR28","doi-asserted-by":"publisher","first-page":"1180","DOI":"10.1016\/j.neucom.2007.11.026","volume":"71","author":"J Peters","year":"2008","unstructured":"Peters, J., & Schaal, S. (2008). Natural actor-critic. Neurocomputing, 71(7), 1180\u20131190.","journal-title":"Neurocomputing"},{"issue":"5","key":"5788_CR29","doi-asserted-by":"publisher","first-page":"997","DOI":"10.1109\/72.623201","volume":"8","author":"DV Prokhorov","year":"1997","unstructured":"Prokhorov, D. V., & Wunsch, D. C. (1997). Adaptive critic designs. Transactions on Neural Networks, 8(5), 997\u20131007.","journal-title":"Transactions on Neural Networks"},{"key":"5788_CR30","unstructured":"Rajeswaran, A., Lowrey, K., Todorov, E. V., & Kakade, S. M. (2017). Towards generalization and simplicity in continuous control. In Advances in neural information processing systems (NIPS)."},{"key":"5788_CR31","doi-asserted-by":"crossref","unstructured":"Robbins, H., & Monro, S. (1985). A stochastic approximation method. In Herbert Robbins selected papers (pp. 102\u2013109). Springer.","DOI":"10.1007\/978-1-4612-5110-1_9"},{"key":"5788_CR32","unstructured":"Schulman, J., Levine, S., Abbeel, P., Jordan, M., & Moritz, P. (2015). Trust region policy optimization. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5788_CR33","unstructured":"Schulman, J., Moritz, P., Levine, S., Jordan, M., & Abbeel, P. (2016). High-dimensional continuous control using generalized advantage estimation. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR34","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017). Proximal policy optimization algorithms. arXiv:1707.06347."},{"key":"5788_CR35","unstructured":"Silver, D., Lever, G., Heess, N., Degris, T., Wierstra, D., & Riedmiller, M., et\u00a0al. (2014). Deterministic policy gradient algorithms. 
In Proceedings of the international conference on machine learning (ICML)."},{"issue":"7587","key":"5788_CR36","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1038\/nature16961","volume":"529","author":"D Silver","year":"2016","unstructured":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., Van Den Driessche, G., et al. (2016). Mastering the game of go with deep neural networks and tree search. Nature, 529(7587), 484\u2013489.","journal-title":"Nature"},{"key":"5788_CR37","volume-title":"Reinforcement Learning: An introduction","author":"RS Sutton","year":"1998","unstructured":"Sutton, R. S., & Barto, A. G. (1998). Reinforcement Learning: An introduction. Cambridge: The MIT Press."},{"key":"5788_CR38","unstructured":"Sutton, R. S., McAllester, D. A., Singh, S. P., & Mansour, Y. (1999). Policy gradient methods for reinforcement learning with function approximation. In Advances in neural information processing systems (NIPS)."},{"key":"5788_CR39","unstructured":"Tamar, A., Di\u00a0Castro, D., & Mannor, S. (2012). Policy gradients with variance related risk criteria. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5788_CR40","doi-asserted-by":"crossref","unstructured":"Todorov, E., Erez, T., & Tassa, Y. (2012). MuJoCo: A physics engine for model-based control. In Proceedings of the international conference on intelligent robots and systems (IROS).","DOI":"10.1109\/IROS.2012.6386109"},{"key":"5788_CR41","unstructured":"van Hasselt, H. (2010). Double Q-learning. In Advances in neural information processing systems (NIPS)."},{"issue":"3\u20134","key":"5788_CR42","first-page":"229","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R. J. (1992). Simple statistical Gradient-Following algorithms for connectionist reinforcement learning. Machine Learning, 8(3\u20134), 229\u2013256.","journal-title":"Machine Learning"},{"key":"5788_CR43","unstructured":"Wu, C., Rajeswaran, A., Duan, Y., Kumar, V., Bayen, A. M., Kakade, S., Mordatch, I., & Abbeel, P. (2018). Variance reduction for policy gradient with action-dependent factorized baselines. In Proceedings of the international conference on learning representations (ICLR)."},{"key":"5788_CR44","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/3074.001.0001","volume-title":"Foundations of robotics: Analysis and control","author":"T Yoshikawa","year":"1990","unstructured":"Yoshikawa, T. (1990). Foundations of robotics: Analysis and control. 
Cambridge: MIT Press."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-019-05788-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-019-05788-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-019-05788-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:35:01Z","timestamp":1721003701000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-019-05788-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,2,21]]},"references-count":44,"journal-issue":{"issue":"8-9","published-print":{"date-parts":[[2019,9]]}},"alternative-id":["5788"],"URL":"https:\/\/doi.org\/10.1007\/s10994-019-05788-0","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,2,21]]},"assertion":[{"value":"13 August 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}