{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T14:25:47Z","timestamp":1749824747636,"version":"3.37.3"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"9-10","license":[{"start":{"date-parts":[[2017,7,13]],"date-time":"2017-07-13T00:00:00Z","timestamp":1499904000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100004963","name":"Seventh Framework Programme","doi-asserted-by":"publisher","award":["610967"],"award-info":[{"award-number":["610967"]}],"id":[{"id":"10.13039\/501100004963","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Seventh Framework Programme (BE)","award":["610967"],"award-info":[{"award-number":["610967"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2017,10]]},"DOI":"10.1007\/s10994-017-5657-1","type":"journal-article","created":{"date-parts":[[2017,7,13]],"date-time":"2017-07-13T18:21:29Z","timestamp":1499970089000},"page":"1705-1724","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Generalized exploration in policy search"],"prefix":"10.1007","volume":"106","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1583-3692","authenticated-orcid":false,"given":"Herke","family":"van Hoof","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1363-7970","authenticated-orcid":false,"given":"Daniel","family":"Tanneberg","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5266-8091","authenticated-orcid":false,"given":"Jan","family":"Peters","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,7,13]]},"reference":[{"key":"5657_CR1","unstructured":"Asmuth, J., Li, L., Littman, M. L., Nouri, A., & Wingate, D. (2009). A Bayesian sampling approach to exploration in reinforcement learning. In Proceedings of the conference on uncertainty in artificial intelligence (UAI) (pp. 19\u201326). AUAI Press."},{"key":"5657_CR2","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1613\/jair.806","volume":"15","author":"J Baxter","year":"2001","unstructured":"Baxter, J., & Bartlett, P. L. (2001). Infinite-horizon policy-gradient estimation. Journal of Artificial Intelligence Research, 15, 319\u2013350.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"5657_CR3","unstructured":"Daniel, C., Neumann, G., Kroemer, O., & Peters, J. (2016a). Hierarchical relative entropy policy search. Journal of Machine Learning Research, 17(93), 1\u201350."},{"key":"5657_CR4","doi-asserted-by":"crossref","unstructured":"Daniel, C., van Hoof, H., Peters, J., & Neumann, G. (2016b). Probabilistic inference for determining options in reinforcement learning. Machine Learning, 104, 337\u2013357.","DOI":"10.1007\/s10994-016-5580-x"},{"key":"5657_CR5","unstructured":"da\u00a0Silva, B.\u00a0C., Konidaris, G., & Barto, A.\u00a0G. (2012).Learning parameterized skills. In Proceedings of the international conference on machine learning (ICML) (pp. 1679\u20131686)."},{"key":"5657_CR6","unstructured":"Dearden, R., Friedman, N., & Andre, D. (1999). Model based Bayesian exploration. In Proceedings of the conference on uncertainty in artificial intelligence (UAI) (pp. 150\u2013159)."},{"key":"5657_CR7","unstructured":"Dearden, R., Friedman, N., & Russell, S. (1998) Bayesian Q-learning. In Proceedings of the national conference on artificial intelligence (AAAI) (pp. 761\u2013768)."},{"key":"5657_CR8","unstructured":"Deisenroth, M. P., Neumann, G., & Peters, J. (2013). A survey on policy search for robotics. Foundations and Trends in Robotics, 2(1\u20132), 1\u2013142."},{"issue":"7","key":"5657_CR9","doi-asserted-by":"crossref","first-page":"1508","DOI":"10.1016\/j.neucom.2008.12.019","volume":"72","author":"MP Deisenroth","year":"2009","unstructured":"Deisenroth, M. P., Rasmussen, C. E., & Peters, J. (2009). Gaussian process dynamic programming. Neurocomputing, 72(7), 1508\u20131524.","journal-title":"Neurocomputing"},{"key":"5657_CR10","doi-asserted-by":"crossref","first-page":"227","DOI":"10.1613\/jair.639","volume":"13","author":"TG Dietterich","year":"2000","unstructured":"Dietterich, T. G. (2000). Hierarchical reinforcement learning with the MAXQ value function decomposition. Journal of Artificial Intelligence Research, 13, 227\u2013303.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"5657_CR11","unstructured":"Doshi-Velez, F., Wingate, D., Roy, N., & Tenenbaum\u00a0J. B. (2010). Nonparametric Bayesian policy priors for reinforcement learning. In Advances in neural information processing systems (NIPS) (pp. 532\u2013540)."},{"key":"5657_CR12","unstructured":"Ghavamzadeh, M.,&\u00a0Mahadevan, S. (2003). Hierarchical policy gradient algorithms. In Proceedings of the international conference on machine learning (ICML) (pp. 226\u2013233)."},{"issue":"5","key":"5657_CR13","doi-asserted-by":"crossref","first-page":"2140","DOI":"10.1109\/TSMCB.2004.832154","volume":"34","author":"M Guo","year":"2004","unstructured":"Guo, M., Liu, Y., & Malec, J. (2004). A new Q-learning algorithm based on the metropolis criterion. IEEE Transactions on Systems, Man, and Cybernetics, Part B: Cybernetics, 34(5), 2140\u20132143.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part B: Cybernetics"},{"issue":"1","key":"5657_CR14","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/106365603321828970","volume":"11","author":"N Hansen","year":"2003","unstructured":"Hansen, N., M\u00fcller, S. D., & Koumoutsakos, P. (2003). Reducing the time complexity of the derandomized evolution strategy with covariance matrix adaptation (CMA-ES). Evolutionary Computation, 11(1), 1\u201318.","journal-title":"Evolutionary Computation"},{"issue":"1","key":"5657_CR15","doi-asserted-by":"crossref","first-page":"97","DOI":"10.1093\/biomet\/57.1.97","volume":"57","author":"WK Hastings","year":"1970","unstructured":"Hastings, W. K. (1970). Monte Carlo sampling methods using Markov chains and their applications. Biometrika, 57(1), 97\u2013109.","journal-title":"Biometrika"},{"key":"5657_CR16","unstructured":"Hausknecht, M., & Stone, P. (2016). Deep reinforcement learning in parameterized action space. In Proceedings of the international conference on learning representations."},{"key":"5657_CR17","unstructured":"Hoffman, M., Doucet, A., de Freitas, N., & Jasra, A. (2007). Bayesian policy learning with trans-dimensional MCMC. In Advances in neural information processing systems (NIPS) (pp. 665\u2013672)."},{"key":"5657_CR18","doi-asserted-by":"crossref","unstructured":"Kaelbling, L.\u00a0P. (1993). Hierarchical learning in stochastic domains: Preliminary results. In Proceedings of the international conference on machine learning (ICML) (pp. 167\u2013173).","DOI":"10.1016\/B978-1-55860-307-3.50028-9"},{"key":"5657_CR19","doi-asserted-by":"crossref","first-page":"237","DOI":"10.1613\/jair.301","volume":"4","author":"LP Kaelbling","year":"1996","unstructured":"Kaelbling, L. P., Littman, M. L., & Moore, A. W. (1996). Reinforcement learning: A survey. Journal of Artificial Intelligence Research, 4, 237\u2013285.","journal-title":"Journal of Artificial Intelligence Research"},{"issue":"32","key":"5657_CR20","doi-asserted-by":"crossref","first-page":"1238","DOI":"10.1177\/0278364913495721","volume":"11","author":"J Kober","year":"2013","unstructured":"Kober, J., Bagnell, J. A., & Peters, J. (2013). Reinforcement learning in robotics: A survey. The International Journal of Robotics Research, 11(32), 1238\u20131274.","journal-title":"The International Journal of Robotics Research"},{"key":"5657_CR21","unstructured":"Kober, J., & Peters, J. (2009). Policy search for motor primitives in robotics. In Advances in neural information processing systems (NIPS) (pp. 849\u2013856)."},{"key":"5657_CR22","doi-asserted-by":"crossref","unstructured":"Kohl, N., & Stone, P. (2004). Policy gradient reinforcement learning for fast quadrupedal locomotion. Proceedings of the IEEE international conference on robotics and automation (ICRA), vol. 3 (pp. 2619\u20132624).","DOI":"10.1109\/ROBOT.2004.1307456"},{"key":"5657_CR23","unstructured":"Konidaris, G., & Barto, A. (2009). Skill discovery in continuous reinforcement learning domains using skill chaining. In Advances in neural information processing systems (NIPS) (pp. 1015\u20131023)."},{"key":"5657_CR24","unstructured":"Kormushev, P., & Caldwell, D.\u00a0G. (2012). Direct policy search reinforcement learning based on particle filtering. In Proceedings of the European workshop on reinforcement learning (EWRL)."},{"key":"5657_CR25","unstructured":"Lillicrap, T.\u00a0P., Hunt, J.\u00a0J., Pritzel, A., Heess, N., Erez, T., & Tassa, Y. et al. (2016). Continuous control with deep reinforcement learning. In Proceedings of the international conference on learning representations."},{"key":"5657_CR26","doi-asserted-by":"crossref","unstructured":"Meijdam, H.\u00a0J., Plooij, M.\u00a0C., & Caarls, W. (2013). Learning while preventing mechanical failure due to random motions. In IEEE\/RSJ international conference on intelligent robots and systems (pp. 182\u2013187).","DOI":"10.1109\/IROS.2013.6696351"},{"issue":"1","key":"5657_CR27","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1016\/S0921-8890(01)00113-0","volume":"36","author":"J Morimoto","year":"2001","unstructured":"Morimoto, J., & Doya, K. (2001). Acquisition of stand-up behavior by a real robot using hierarchical reinforcement learning. Robotics and Autonomous Systems, 36(1), 37\u201351.","journal-title":"Robotics and Autonomous Systems"},{"key":"5657_CR28","first-page":"771","volume":"7","author":"R Munos","year":"2006","unstructured":"Munos, R. (2006). Policy gradient in continuous time. Journal of Machine Learning Research, 7, 771\u2013791.","journal-title":"Journal of Machine Learning Research"},{"key":"5657_CR29","doi-asserted-by":"crossref","first-page":"475","DOI":"10.1613\/jair.3062","volume":"38","author":"PA Ortega","year":"2010","unstructured":"Ortega, P. A., & Braun, D. A. (2010). A minimum relative entropy principle for learning and acting. Journal of Artificial Intelligence Research, 38, 475\u2013511.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"5657_CR30","unstructured":"Osband, I., Blundell, C., Pritzel, A., & Van\u00a0Roy, B. (2016). Deep exploration via bootstrapped dqn. In Advances in neural information processing systems (pp. 4026\u20134034)."},{"key":"5657_CR31","unstructured":"Osband, I., Van\u00a0Roy, B., & Wen, Z. (2016). Generalization and exploration via randomized value functions. In Proceedings of the international conference on machine learning (ICML) (pp. 2377\u20132386)."},{"key":"5657_CR32","unstructured":"Parr, R., & Russell, S. (1998). Reinforcement learning with hierarchies of machines. Advances in neural information processing systems (NIPS) (pp. 1043\u20131049)."},{"key":"5657_CR33","doi-asserted-by":"crossref","unstructured":"Peters, J., M\u00fclling, K., & Alt\u00fcn, Y. (2010). Relative entropy policy search. In Proceedings of the national conference on artificial intelligence (AAAI), physically grounded AI track (pp. 1607\u20131612).","DOI":"10.1609\/aaai.v24i1.7727"},{"key":"5657_CR34","unstructured":"Precup, D. (2000). Temporal abstraction in reinforcement learning. Ph.D. thesis, University of Massachusetts Amherst."},{"key":"5657_CR35","unstructured":"Rahimi, A., & Recht, B. (2007). Random features for large-scale kernel machines. In Advances in neural information processing systems (NIPS) (pp. 1177\u20131184)."},{"issue":"1","key":"5657_CR36","doi-asserted-by":"crossref","first-page":"14","DOI":"10.2478\/s13230-010-0002-4","volume":"1","author":"T R\u00fcckstie\u00df","year":"2010","unstructured":"R\u00fcckstie\u00df, T., Sehnke, F., Schaul, T., Wierstra, D., Sun, Y., & Schmidhuber, J. (2010). Exploring parameter space in reinforcement learning. Paladyn, Journal of Behavioral Robotics, 1(1), 14\u201324.","journal-title":"Paladyn, Journal of Behavioral Robotics"},{"key":"5657_CR37","doi-asserted-by":"crossref","unstructured":"Schaal, S., Peters, J., Nakanishi, J., & Ijspeert, A. (2005). Learning movement primitives. In International symposium on robotics research (pp. 561\u2013572).","DOI":"10.1007\/11008941_60"},{"issue":"4","key":"5657_CR38","doi-asserted-by":"crossref","first-page":"551","DOI":"10.1016\/j.neunet.2009.12.004","volume":"23","author":"F Sehnke","year":"2010","unstructured":"Sehnke, F., Osendorfer, C., R\u00fcckstie\u00df, T., Graves, A., Peters, J., & Schmidhuber, J. (2010). Parameter-exploring policy gradients. Neural Networks, 23(4), 551\u2013559.","journal-title":"Neural Networks"},{"issue":"3","key":"5657_CR39","first-page":"323","volume":"8","author":"S Singh","year":"1992","unstructured":"Singh, S. (1992). Transfer of learning by composing solutions of elemental sequential tasks. Machine Learning, 8(3), 323\u2013339.","journal-title":"Machine Learning"},{"key":"5657_CR40","unstructured":"Strens, M. (2000). A Bayesian framework for reinforcement learning. In Proceedings of the international conference on machine learning (ICML) (pp. 943\u2013950)."},{"key":"5657_CR41","doi-asserted-by":"crossref","unstructured":"Stulp, F., & Schaal, S. (2011). Hierarchical reinforcement learning with movement primitives. In Proceedings of the IEEE international conference on humanoid robots (Humanoids) (pp. 231\u2013238).","DOI":"10.1109\/Humanoids.2011.6100841"},{"key":"5657_CR42","unstructured":"Stulp, F., & Sigaud, O. (2012). Path integral policy improvement with covariance matrix adaptation. In Proceedings of the international conference on machine learning (ICML)."},{"key":"5657_CR43","volume-title":"Reinforcement learning: An introduction","author":"RS Sutton","year":"1998","unstructured":"Sutton, R. S., & Barto, A. G. (1998). Reinforcement learning: An introduction. Cambridge: MIT press."},{"issue":"1","key":"5657_CR44","doi-asserted-by":"crossref","first-page":"181","DOI":"10.1016\/S0004-3702(99)00052-1","volume":"112","author":"RS Sutton","year":"1999","unstructured":"Sutton, R. S., Precup, D., & Singh, S. (1999). Between MDPs and semi-MDPs: A framework for temporal abstraction in reinforcement learning. Artificial Intelligence, 112(1), 181\u2013211.","journal-title":"Artificial Intelligence"},{"key":"5657_CR45","first-page":"3137","volume":"11","author":"E Theodorou","year":"2010","unstructured":"Theodorou, E., Buchli, J., & Schaal, S. (2010). A generalized path integral control approach to reinforcement learning. The Journal of Machine Learning Research, 11, 3137\u20133181.","journal-title":"The Journal of Machine Learning Research"},{"key":"5657_CR46","unstructured":"van Hoof, H., Peters, J., & Neumann, G. (2015). Learning of non-parametric control policies with high-dimensional state features. In Proceedings of the international conference on artificial intelligence and statistics (AIstats) (pp. 995\u20131003)."},{"key":"5657_CR47","unstructured":"Vezhnevets, A., Mnih, V., Osindero, S., Graves, A., Vinyals, O., & Agapiou, J., et\u00a0al. (2016). Strategic attentive writer for learning macro-actions. In Advances in neural information processing systems (pp. 3486\u20133494)."},{"key":"5657_CR48","unstructured":"Watkins, C., & Buttkewitz, Y. (2014). Sex as Gibbs sampling: A probability model of evolution. Technical Report. arXiv:1402.2704"},{"key":"5657_CR49","doi-asserted-by":"crossref","first-page":"91","DOI":"10.7763\/IJMLC.2015.V5.489","volume":"5","author":"P Wawrzy\u0144ski","year":"2015","unstructured":"Wawrzy\u0144ski, P. (2015). Control policy with autocorrelated noise in reinforcement learning for robotics. International Journal of Machine Learning and Computing, 5, 91\u201395.","journal-title":"International Journal of Machine Learning and Computing"},{"issue":"3\u20134","key":"5657_CR50","first-page":"229","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8(3\u20134), 229\u2013256.","journal-title":"Machine Learning"},{"key":"5657_CR51","unstructured":"Wingate, D., Goodman, N.\u00a0D., Roy, D.\u00a0M., Kaelbling, L.\u00a0P. & Tenenbaum, J.\u00a0B. (2011). Bayesian policy search with policy priors. In International joint conference on artificial intelligence (IJCAI)."},{"key":"5657_CR52","unstructured":"Wyatt, J. (1998) Exploration and inference in learning from reinforcement. Ph.D. thesis, University of Edinburgh, College of Science and Engineering, School of Informatics."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-017-5657-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-017-5657-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-017-5657-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,24]],"date-time":"2023-08-24T13:42:35Z","timestamp":1692884555000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-017-5657-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,7,13]]},"references-count":52,"journal-issue":{"issue":"9-10","published-print":{"date-parts":[[2017,10]]}},"alternative-id":["5657"],"URL":"https:\/\/doi.org\/10.1007\/s10994-017-5657-1","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"type":"print","value":"0885-6125"},{"type":"electronic","value":"1573-0565"}],"subject":[],"published":{"date-parts":[[2017,7,13]]}}}