{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,29]],"date-time":"2026-06-29T17:51:49Z","timestamp":1782755509499,"version":"3.54.5"},"publisher-location":"Berlin, Heidelberg","reference-count":83,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642276446","type":"print"},{"value":"9783642276453","type":"electronic"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-27645-3_11","type":"book-chapter","created":{"date-parts":[[2012,3,5]],"date-time":"2012-03-05T22:18:12Z","timestamp":1330985892000},"page":"359-386","source":"Crossref","is-referenced-by-count":39,"title":["Bayesian Reinforcement Learning"],"prefix":"10.1007","author":[{"given":"Nikos","family":"Vlassis","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohammad","family":"Ghavamzadeh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shie","family":"Mannor","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pascal","family":"Poupart","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","reference":[{"key":"11_CR1","unstructured":"Aharony, N., Zehavi, T., Engel, Y.: Learning wireless network association control with Gaussian process temporal difference methods. In: Proceedings of OPNETWORK (2005)"},{"key":"11_CR2","unstructured":"Asmuth, J., Li, L., Littman, M.L., Nouri, A., Wingate, D.: A Bayesian sampling approach to exploration in reinforcement learning. In: Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence, UAI 2009, pp. 19\u201326. AUAI Press (2009)"},{"key":"11_CR3","unstructured":"Bagnell, J., Schneider, J.: Covariant policy search. In: Proceedings of the Eighteenth International Joint Conference on Artificial Intelligence (2003)"},{"key":"11_CR4","first-page":"835","volume":"13","author":"A. Barto","year":"1983","unstructured":"Barto, A., Sutton, R., Anderson, C.: Neuron-like elements that can solve difficult learning control problems. IEEE Transaction on Systems, Man and Cybernetics\u00a013, 835\u2013846 (1983)","journal-title":"IEEE Transaction on Systems, Man and Cybernetics"},{"key":"11_CR5","doi-asserted-by":"crossref","first-page":"149","DOI":"10.1613\/jair.731","volume":"12","author":"J. Baxter","year":"2000","unstructured":"Baxter, J.: A model of inductive bias learning. Journal of Artificial Intelligence Research\u00a012, 149\u2013198 (2000)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"11_CR6","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1613\/jair.806","volume":"15","author":"J. Baxter","year":"2001","unstructured":"Baxter, J., Bartlett, P.: Infinite-horizon policy-gradient estimation. Journal of Artificial Intelligence Research\u00a015, 319\u2013350 (2001)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"11_CR7","first-page":"221","volume":"16","author":"R. Bellman","year":"1956","unstructured":"Bellman, R.: A problem in sequential design of experiments. Sankhya\u00a016, 221\u2013229 (1956)","journal-title":"Sankhya"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Bellman, R.: Adaptive Control Processes: A Guided Tour. Princeton University Press (1961)","DOI":"10.1515\/9781400874668"},{"issue":"2","key":"11_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TAC.1959.1104847","volume":"4","author":"R. Bellman","year":"1959","unstructured":"Bellman, R., Kalaba, R.: On adaptive control processes. Transactions on Automatic Control, IRE\u00a04(2), 1\u20139 (1959)","journal-title":"Transactions on Automatic Control, IRE"},{"key":"11_CR10","unstructured":"Bhatnagar, S., Sutton, R., Ghavamzadeh, M., Lee, M.: Incremental natural actor-critic algorithms. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a020, pp. 105\u2013112. MIT Press (2007)"},{"issue":"11","key":"11_CR11","doi-asserted-by":"publisher","first-page":"2471","DOI":"10.1016\/j.automatica.2009.07.008","volume":"45","author":"S. Bhatnagar","year":"2009","unstructured":"Bhatnagar, S., Sutton, R., Ghavamzadeh, M., Lee, M.: Natural actor-critic algorithms. Automatica\u00a045(11), 2471\u20132482 (2009)","journal-title":"Automatica"},{"key":"11_CR12","first-page":"213","volume":"3","author":"R. Brafman","year":"2002","unstructured":"Brafman, R., Tennenholtz, M.: R-max - a general polynomial time algorithm for near-optimal reinforcement learning. JMLR\u00a03, 213\u2013231 (2002)","journal-title":"JMLR"},{"issue":"1","key":"11_CR13","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1023\/A:1007379606734","volume":"28","author":"R. Caruana","year":"1997","unstructured":"Caruana, R.: Multitask learning. Machine Learning\u00a028(1), 41\u201375 (1997)","journal-title":"Machine Learning"},{"key":"11_CR14","unstructured":"Castro, P., Precup, D.: Using linear programming for Bayesian exploration in Markov decision processes. In: Proc. 20th International Joint Conference on Artificial Intelligence (2007)"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Chalkiadakis, G., Boutilier, C.: Coordination in multi-agent reinforcement learning: A Bayesian approach. In: International Joint Conference on Autonomous Agents and Multiagent Systems (AAMAS), pp. 709\u2013716 (2003)","DOI":"10.1145\/860685.860689"},{"key":"11_CR16","unstructured":"Chalkiadakis, G., Boutilier, C.: Bayesian reinforcement learning for coalition formation under uncertainty. In: International Joint Conference on Autonomous Agents and Multiagent Systems (AAMAS), pp. 1090\u20131097 (2004)"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Cozzolino, J., Gonzales-Zubieta, R., Miller, R.L.: Markovian decision processes with uncertain transition probabilities. Tech. Rep. Technical Report No. 11, Research in the Control of Complex Systems. Operations Research Center, Massachusetts Institute of Technology (1965)","DOI":"10.21236\/AD0612601"},{"key":"11_CR18","unstructured":"Cozzolino, J.M.: Optimal sequential decision making under uncertainty. Master\u2019s thesis, Massachusetts Institute of Technology (1964)"},{"key":"11_CR19","unstructured":"Dearden, R., Friedman, N., Russell, S.: Bayesian Q-learning. In: Proceedings of the Fifteenth National Conference on Artificial Intelligence, pp. 761\u2013768 (1998)"},{"key":"11_CR20","unstructured":"Dearden, R., Friedman, N., Andre, D.: Model based Bayesian exploration. In: UAI, pp. 150\u2013159 (1999)"},{"key":"11_CR21","volume-title":"Optimal Statistical Decisions","author":"M.H. DeGroot","year":"1970","unstructured":"DeGroot, M.H.: Optimal Statistical Decisions. McGraw-Hill, New York (1970)"},{"issue":"1","key":"11_CR22","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1287\/opre.1080.0685","volume":"58","author":"E. Delage","year":"2010","unstructured":"Delage, E., Mannor, S.: Percentile optimization for Markov decision processes with parameter uncertainty. Operations Research\u00a058(1), 203\u2013213 (2010)","journal-title":"Operations Research"},{"key":"11_CR23","doi-asserted-by":"crossref","unstructured":"Dimitrakakis, C.: Complexity of stochastic branch and bound methods for belief tree search in bayesian reinforcement learning. In: ICAART (1), pp. 259\u2013264 (2010)","DOI":"10.5220\/0002721402590264"},{"key":"11_CR24","unstructured":"Doshi-Velez, F.: The infinite partially observable Markov decision process. In: Neural Information Processing Systems (2009)"},{"key":"11_CR25","unstructured":"Doshi-Velez, F., Wingate, D., Roy, N., Tenenbaum, J.: Nonparametric Bayesian policy priors for reinforcement learning. In: NIPS (2010)"},{"key":"11_CR26","unstructured":"Duff, M.: Optimal learning: Computational procedures for Bayes-adaptive Markov decision processes. PhD thesis, University of Massassachusetts Amherst (2002)"},{"key":"11_CR27","unstructured":"Duff, M.: Design for an optimal probe. In: ICML, pp. 131\u2013138 (2003)"},{"key":"11_CR28","unstructured":"Engel, Y.: Algorithms and representations for reinforcement learning. PhD thesis, The Hebrew University of Jerusalem, Israel (2005)"},{"key":"11_CR29","series-title":"Lecture Notes in Artificial Intelligence","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1007\/3-540-36755-1_8","volume-title":"Machine Learning: ECML 2002","author":"Y. Engel","year":"2002","unstructured":"Engel, Y., Mannor, S., Meir, R.: Sparse Online Greedy Support Vector Regression. In: Elomaa, T., Mannila, H., Toivonen, H. (eds.) ECML 2002. LNCS (LNAI), vol.\u00a02430, pp. 84\u201396. Springer, Heidelberg (2002)"},{"key":"11_CR30","unstructured":"Engel, Y., Mannor, S., Meir, R.: Bayes meets Bellman: The Gaussian process approach to temporal difference learning. In: Proceedings of the Twentieth International Conference on Machine Learning, pp. 154\u2013161 (2003)"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Engel, Y., Mannor, S., Meir, R.: Reinforcement learning with Gaussian processes. In: Proceedings of the Twenty Second International Conference on Machine Learning, pp. 201\u2013208 (2005a)","DOI":"10.1145\/1102351.1102377"},{"key":"11_CR32","unstructured":"Engel, Y., Szabo, P., Volkinshtein, D.: Learning to control an octopus arm with Gaussian process temporal difference methods. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a018, pp. 347\u2013354. MIT Press (2005b)"},{"key":"11_CR33","unstructured":"Fard, M.M., Pineau, J.: PAC-Bayesian model selection for reinforcement learning. In: Lafferty, J., Williams, C.K.I., Shawe-Taylor, J., Zemel, R., Culotta, A. (eds.) Advances in Neural Information Processing Systems, vol.\u00a023, pp. 1624\u20131632 (2010)"},{"key":"11_CR34","doi-asserted-by":"crossref","unstructured":"Ghavamzadeh, M., Engel, Y.: Bayesian policy gradient algorithms. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a019, MIT Press (2006)","DOI":"10.7551\/mitpress\/7503.003.0062"},{"key":"11_CR35","doi-asserted-by":"crossref","unstructured":"Ghavamzadeh, M., Engel, Y.: Bayesian Actor-Critic algorithms. In: Proceedings of the Twenty-Fourth International Conference on Machine Learning (2007)","DOI":"10.1145\/1273496.1273534"},{"key":"11_CR36","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1613\/jair.1579","volume":"24","author":"P. Gmytrasiewicz","year":"2005","unstructured":"Gmytrasiewicz, P., Doshi, P.: A framework for sequential planning in multi-agent settings. Journal of Artificial Intelligence Research (JAIR)\u00a024, 49\u201379 (2005)","journal-title":"Journal of Artificial Intelligence Research (JAIR)"},{"key":"11_CR37","first-page":"1471","volume":"5","author":"E. Greensmith","year":"2004","unstructured":"Greensmith, E., Bartlett, P., Baxter, J.: Variance reduction techniques for gradient estimates in reinforcement learning. Journal of Machine Learning Research\u00a05, 1471\u20131530 (2004)","journal-title":"Journal of Machine Learning Research"},{"issue":"2","key":"11_CR38","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1287\/moor.1040.0129","volume":"30","author":"G.N. Iyengar","year":"2005","unstructured":"Iyengar, G.N.: Robust dynamic programming. Mathematics of Operations Research\u00a030(2), 257\u2013280 (2005)","journal-title":"Mathematics of Operations Research"},{"key":"11_CR39","unstructured":"Jaakkola, T., Haussler, D.: Exploiting generative models in discriminative classifiers. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a011, MIT Press (1999)"},{"key":"11_CR40","doi-asserted-by":"crossref","unstructured":"Kaelbling, L.P.: Learning in Embedded Systems. MIT Press (1993)","DOI":"10.7551\/mitpress\/4168.001.0001"},{"key":"11_CR41","unstructured":"Kakade, S.: A natural policy gradient. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a014 (2002)"},{"key":"11_CR42","unstructured":"Kearns, M., Mansour, Y., Ng, A.: A sparse sampling algorithm for near-optimal planning in large Markov decision processes. In: Proc. IJCAI (1999)"},{"key":"11_CR43","first-page":"513","volume-title":"Proceedings of the 26th Annual International Conference on Machine Learning, ICML 2009","author":"J.Z. Kolter","year":"2009","unstructured":"Kolter, J.Z., Ng, A.Y.: Near-bayesian exploration in polynomial time. In: Proceedings of the 26th Annual International Conference on Machine Learning, ICML 2009, pp. 513\u2013520. ACM, New York (2009)"},{"key":"11_CR44","unstructured":"Konda, V., Tsitsiklis, J.: Actor-Critic algorithms. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a012, pp. 1008\u20131014 (2000)"},{"key":"11_CR45","unstructured":"Lazaric, A., Ghavamzadeh, M.: Bayesian multi-task reinforcement learning. In: Proceedings of the Twenty-Seventh International Conference on Machine Learning, pp. 599\u2013606 (2010)"},{"key":"11_CR46","doi-asserted-by":"crossref","unstructured":"Lazaric, A., Restelli, M., Bonarini, A.: Transfer of samples in batch reinforcement learning. In: Proceedings of ICML, vol.\u00a025, pp. 544\u2013551 (2008)","DOI":"10.1145\/1390156.1390225"},{"issue":"2","key":"11_CR47","doi-asserted-by":"publisher","first-page":"308","DOI":"10.1287\/mnsc.1060.0614","volume":"53","author":"S. Mannor","year":"2007","unstructured":"Mannor, S., Simester, D., Sun, P., Tsitsiklis, J.N.: Bias and variance approximation in value function estimates. Management Science\u00a053(2), 308\u2013322 (2007)","journal-title":"Management Science"},{"key":"11_CR48","unstructured":"Marbach, P.: Simulated-based methods for Markov decision processes. PhD thesis, Massachusetts Institute of Technology (1998)"},{"key":"11_CR49","volume-title":"Bayesian decision problems and Markov chains","author":"J.J. Martin","year":"1967","unstructured":"Martin, J.J.: Bayesian decision problems and Markov chains. John Wiley, New York (1967)"},{"issue":"3","key":"11_CR50","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s10994-008-5061-y","volume":"73","author":"N. Mehta","year":"2008","unstructured":"Mehta, N., Natarajan, S., Tadepalli, P., Fern, A.: Transfer in variable-reward hierarchical reinforcement learning. Machine Learning\u00a073(3), 289\u2013312 (2008)","journal-title":"Machine Learning"},{"key":"11_CR51","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1023\/A:1007541107674","volume":"35","author":"N. Meuleau","year":"1999","unstructured":"Meuleau, N., Bourgine, P.: Exploration of multi-state environments: local measures and back-propagation of uncertainty. Machine Learning\u00a035, 117\u2013154 (1999)","journal-title":"Machine Learning"},{"issue":"5","key":"11_CR52","doi-asserted-by":"publisher","first-page":"780","DOI":"10.1287\/opre.1050.0216","volume":"53","author":"A. Nilim","year":"2005","unstructured":"Nilim, A., El Ghaoui, L.: Robust control of Markov decision processes with uncertain transition matrices. Operations Research\u00a053(5), 780\u2013798 (2005)","journal-title":"Operations Research"},{"key":"11_CR53","doi-asserted-by":"publisher","first-page":"247","DOI":"10.2307\/2348519","volume":"36","author":"A. O\u2019Hagan","year":"1987","unstructured":"O\u2019Hagan, A.: Monte Carlo is fundamentally unsound. The Statistician\u00a036, 247\u2013249 (1987)","journal-title":"The Statistician"},{"key":"11_CR54","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1016\/0378-3758(91)90002-V","volume":"29","author":"A. O\u2019Hagan","year":"1991","unstructured":"O\u2019Hagan, A.: Bayes-Hermite quadrature. Journal of Statistical Planning and Inference\u00a029, 245\u2013260 (1991)","journal-title":"Journal of Statistical Planning and Inference"},{"key":"11_CR55","unstructured":"Pavlov, M., Poupart, P.: Towards global reinforcement learning. In: NIPS Workshop on Model Uncertainty and Risk in Reinforcement Learning (2008)"},{"issue":"4","key":"11_CR56","doi-asserted-by":"publisher","first-page":"682","DOI":"10.1016\/j.neunet.2008.02.003","volume":"21","author":"J. Peters","year":"2008","unstructured":"Peters, J., Schaal, S.: Reinforcement learning of motor skills with policy gradients. Neural Networks\u00a021(4), 682\u2013697 (2008)","journal-title":"Neural Networks"},{"key":"11_CR57","unstructured":"Peters, J., Vijayakumar, S., Schaal, S.: Reinforcement learning for humanoid robotics. In: Proceedings of the Third IEEE-RAS International Conference on Humanoid Robots (2003)"},{"key":"11_CR58","series-title":"Lecture Notes in Artificial Intelligence","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/11564096_29","volume-title":"Machine Learning: ECML 2005","author":"J. Peters","year":"2005","unstructured":"Peters, J., Vijayakumar, S., Schaal, S.: Natural Actor-Critic. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds.) ECML 2005. LNCS (LNAI), vol.\u00a03720, pp. 280\u2013291. Springer, Heidelberg (2005)"},{"key":"11_CR59","unstructured":"Porta, J.M., Spaan, M.T., Vlassis, N.: Robot planning in partially observable continuous domains. In: Proc. Robotics: Science and Systems (2005)"},{"key":"11_CR60","unstructured":"Poupart, P., Vlassis, N.: Model-based Bayesian reinforcement learning in partially observable domains. In: International Symposium on Artificial Intelligence and Mathematics, ISAIM (2008)"},{"key":"11_CR61","doi-asserted-by":"crossref","unstructured":"Poupart, P., Vlassis, N., Hoey, J., Regan, K.: An analytic solution to discrete Bayesian reinforcement learning. In: Proc. Int. Conf. on Machine Learning, Pittsburgh, USA (2006)","DOI":"10.1145\/1143844.1143932"},{"key":"11_CR62","unstructured":"Rasmussen, C., Ghahramani, Z.: Bayesian Monte Carlo. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a015, pp. 489\u2013496. MIT Press (2003)"},{"key":"11_CR63","doi-asserted-by":"crossref","unstructured":"Rasmussen, C., Williams, C.: Gaussian Processes for Machine Learning. MIT Press (2006)","DOI":"10.7551\/mitpress\/3206.001.0001"},{"key":"11_CR64","doi-asserted-by":"crossref","unstructured":"Reisinger, J., Stone, P., Miikkulainen, R.: Online kernel selection for Bayesian reinforcement learning. In: Proceedings of the Twenty-Fifth Conference on Machine Learning, pp. 816\u2013823 (2008)","DOI":"10.1145\/1390156.1390259"},{"key":"11_CR65","unstructured":"Ross, S., Pineau, J.: Model-based Bayesian reinforcement learning in large structured domains. In: Uncertainty in Artificial Intelligence, UAI (2008)"},{"key":"11_CR66","unstructured":"Ross, S., Chaib-Draa, B., Pineau, J.: Bayes-adaptive POMDPs. In: Advances in Neural Information Processing Systems, NIPS (2007)"},{"key":"11_CR67","doi-asserted-by":"crossref","unstructured":"Ross, S., Chaib-Draa, B., Pineau, J.: Bayesian reinforcement learning in continuous POMDPs with application to robot navigation. In: IEEE International Conference on Robotics and Automation (ICRA), pp. 2845\u20132851 (2008)","DOI":"10.1109\/ROBOT.2008.4543641"},{"key":"11_CR68","doi-asserted-by":"crossref","unstructured":"Shawe-Taylor, J., Cristianini, N.: Kernel Methods for Pattern Analysis. Cambridge University Press (2004)","DOI":"10.1017\/CBO9780511809682"},{"key":"11_CR69","unstructured":"Silver, E.A.: Markov decision processes with uncertain transition probabilities or rewards. Tech. Rep. Technical Report No. 1, Research in the Control of Complex Systems. Operations Research Center, Massachusetts Institute of Technology (1963)"},{"key":"11_CR70","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1613\/jair.1659","volume":"24","author":"M.T.J. Spaan","year":"2005","unstructured":"Spaan, M.T.J., Vlassis, N.: Perseus: Randomized point-based value iteration for POMDPs. Journal of Artificial Intelligence Research\u00a024, 195\u2013220 (2005)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"11_CR71","unstructured":"Strehl, A.L., Li, L., Littman, M.L.: Incremental model-based learners with formal learning-time guarantees. In: UAI (2006)"},{"key":"11_CR72","unstructured":"Strens, M.: A Bayesian framework for reinforcement learning. In: ICML (2000)"},{"key":"11_CR73","unstructured":"Sutton, R.: Temporal credit assignment in reinforcement learning. PhD thesis, University of Massachusetts Amherst (1984)"},{"key":"11_CR74","first-page":"9","volume":"3","author":"R. Sutton","year":"1988","unstructured":"Sutton, R.: Learning to predict by the methods of temporal differences. Machine Learning\u00a03, 9\u201344 (1988)","journal-title":"Machine Learning"},{"key":"11_CR75","unstructured":"Sutton, R., McAllester, D., Singh, S., Mansour, Y.: Policy gradient methods for reinforcement learning with function approximation. In: Proceedings of Advances in Neural Information Processing Systems, vol.\u00a012, pp. 1057\u20131063 (2000)"},{"key":"11_CR76","first-page":"2125","volume":"8","author":"M. Taylor","year":"2007","unstructured":"Taylor, M., Stone, P., Liu, Y.: Transfer learning via inter-task mappings for temporal difference learning. JMLR\u00a08, 2125\u20132167 (2007)","journal-title":"JMLR"},{"key":"11_CR77","doi-asserted-by":"crossref","first-page":"285","DOI":"10.1093\/biomet\/25.3-4.285","volume":"25","author":"W.R. Thompson","year":"1933","unstructured":"Thompson, W.R.: On the likelihood that one unknown probability exceeds another in view of the evidence of two samples. Biometrika\u00a025, 285\u2013294 (1933)","journal-title":"Biometrika"},{"key":"11_CR78","doi-asserted-by":"crossref","unstructured":"Veness, J., Ng, K.S., Hutter, M., Silver, D.: Reinforcement learning via AIXI approximation. In: AAAI (2010)","DOI":"10.1613\/jair.3125"},{"key":"11_CR79","doi-asserted-by":"crossref","unstructured":"Wang, T., Lizotte, D., Bowling, M., Schuurmans, D.: Bayesian sparse sampling for on-line reward optimization. In: ICML (2005)","DOI":"10.1145\/1102351.1102472"},{"key":"11_CR80","unstructured":"Watkins, C.: Learning from delayed rewards. PhD thesis, Kings College, Cambridge, England (1989)"},{"key":"11_CR81","unstructured":"Wiering, M.: Explorations in efficient reinforcement learning. PhD thesis, University of Amsterdam (1999)"},{"key":"11_CR82","first-page":"229","volume":"8","author":"R. Williams","year":"1992","unstructured":"Williams, R.: Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning\u00a08, 229\u2013256 (1992)","journal-title":"Machine Learning"},{"key":"11_CR83","doi-asserted-by":"crossref","unstructured":"Wilson, A., Fern, A., Ray, S., Tadepalli, P.: Multi-task reinforcement learning: A hierarchical Bayesian approach. In: Proceedings of ICML, vol.\u00a024, pp. 1015\u20131022 (2007)","DOI":"10.1145\/1273496.1273624"}],"container-title":["Adaptation, Learning, and Optimization","Reinforcement Learning"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-27645-3_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T13:02:59Z","timestamp":1742648579000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-27645-3_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642276446","9783642276453"],"references-count":83,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-27645-3_11","relation":{},"ISSN":["1867-4534","1867-4542"],"issn-type":[{"value":"1867-4534","type":"print"},{"value":"1867-4542","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}