{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:36:17Z","timestamp":1780709777803,"version":"3.54.1"},"publisher-location":"Berlin, Heidelberg","reference-count":62,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642276446","type":"print"},{"value":"9783642276453","type":"electronic"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-27645-3_1","type":"book-chapter","created":{"date-parts":[[2012,3,5]],"date-time":"2012-03-05T22:18:12Z","timestamp":1330985892000},"page":"3-42","source":"Crossref","is-referenced-by-count":422,"title":["Reinforcement Learning and Markov Decision Processes"],"prefix":"10.1007","author":[{"given":"Martijn","family":"van Otterlo","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Marco","family":"Wiering","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Sammut, C.: A framework for behavioral cloning. In: Muggleton, S.H., Furakawa, K., Michie, D. (eds.) Machine Intelligence, vol.\u00a015, pp. 103\u2013129. Oxford University Press (1995)","DOI":"10.1093\/oso\/9780198538677.003.0006"},{"key":"1_CR2","first-page":"835","volume":"13","author":"A.G. Barto","year":"1983","unstructured":"Barto, A.G., Sutton, R.S., Anderson, C.W.: Neuronlike elements that can solve difficult learning control problems. IEEE Transactions on Systems, Man, and Cybernetics\u00a013, 835\u2013846 (1983)","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics"},{"issue":"1","key":"1_CR3","doi-asserted-by":"crossref","first-page":"81","DOI":"10.1016\/0004-3702(94)00011-O","volume":"72","author":"A.G. Barto","year":"1995","unstructured":"Barto, A.G., Bradtke, S.J., Singh, S.: Learning to act using real-time dynamic programming. Artificial Intelligence\u00a072(1), 81\u2013138 (1995)","journal-title":"Artificial Intelligence"},{"key":"1_CR4","volume-title":"Dynamic Programming","author":"R.E. Bellman","year":"1957","unstructured":"Bellman, R.E.: Dynamic Programming. Princeton University Press, Princeton (1957)"},{"key":"1_CR5","volume-title":"Dynamic Programming and Optimal Control","author":"D.P. Bertsekas","year":"1995","unstructured":"Bertsekas, D.P.: Dynamic Programming and Optimal Control, vol.\u00a01, 2. Athena Scientific, Belmont (1995)"},{"key":"1_CR6","volume-title":"Neuro-Dynamic Programming","author":"D.P. Bertsekas","year":"1996","unstructured":"Bertsekas, D.P., Tsitsiklis, J.: Neuro-Dynamic Programming. Athena Scientific, Belmont (1996)"},{"key":"1_CR7","unstructured":"Bonet, B., Geffner, H.: Faster heuristic search algorithms for planning with uncertainty and full feedback. In: Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI), pp. 1233\u20131238 (2003a)"},{"key":"1_CR8","unstructured":"Bonet, B., Geffner, H.: Labeled RTDP: Improving the convergence of real-time dynamic programming. In: Proceedings of the International Conference on Artificial Intelligence Planning Systems (ICAPS), pp. 12\u201321 (2003b)"},{"key":"1_CR9","series-title":"Lecture Notes in Artificial Intelligence","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1007\/3-540-48317-9_5","volume-title":"Artificial Intelligence Today","author":"C. Boutilier","year":"1999","unstructured":"Boutilier, C.: Knowledge Representation for Stochastic Decision Processes. In: Veloso, M.M., Wooldridge, M.J. (eds.) Artificial Intelligence Today. LNCS (LNAI), vol.\u00a01600, pp. 111\u2013152. Springer, Heidelberg (1999)"},{"key":"1_CR10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1613\/jair.575","volume":"11","author":"C. Boutilier","year":"1999","unstructured":"Boutilier, C., Dean, T., Hanks, S.: Decision theoretic planning: Structural assumptions and computational leverage. Journal of Artificial Intelligence Research\u00a011, 1\u201394 (1999)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"1_CR11","first-page":"213","volume":"3","author":"R.I. Brafman","year":"2002","unstructured":"Brafman, R.I., Tennenholtz, M.: R-MAX - a general polynomial time algorithm for near-optimal reinforcement learning. Journal of Machine Learning Research (JMLR)\u00a03, 213\u2013231 (2002)","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"1_CR12","doi-asserted-by":"crossref","first-page":"35","DOI":"10.1016\/0004-3702(94)00086-G","volume":"76","author":"T. Dean","year":"1995","unstructured":"Dean, T., Kaelbling, L.P., Kirman, J., Nicholson, A.: Planning under time constraints in stochastic domains. Artificial Intelligence\u00a076, 35\u201374 (1995)","journal-title":"Artificial Intelligence"},{"key":"1_CR13","unstructured":"Dixon, K.R., Malak, M.J., Khosla, P.K.: Incorporating prior knowledge and previously learned information into reinforcement learning agents. Tech. rep., Institute for Complex Engineered Systems, Carnegie Mellon University (2000)"},{"key":"1_CR14","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/5988.001.0001","volume-title":"Robot Shaping: An Experiment in Behavior Engineering","author":"M. Dorigo","year":"1997","unstructured":"Dorigo, M., Colombetti, M.: Robot Shaping: An Experiment in Behavior Engineering. The MIT Press, Cambridge (1997)"},{"key":"1_CR15","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/4378.001.0001","volume-title":"Made-Up Minds: A Constructivist Approach to Artificial Intelligence","author":"G. Drescher","year":"1991","unstructured":"Drescher, G.: Made-Up Minds: A Constructivist Approach to Artificial Intelligence. The MIT Press, Cambridge (1991)"},{"key":"1_CR16","unstructured":"Ferguson, D., Stentz, A.: Focussed dynamic programming: Extensive comparative results. Tech. Rep. CMU-RI-TR-04-13, Robotics Institute, Carnegie Mellon University, Pittsburgh, Pennsylvania (2004)"},{"key":"1_CR17","unstructured":"Fr\u00e4mling, K.: Bi-memory model for guiding exploration by pre-existing knowledge. In: Driessens, K., Fern, A., van Otterlo, M. (eds.) Proceedings of the ICML-2005 Workshop on Rich Representations for Reinforcement Learning, pp. 21\u201326 (2005)"},{"key":"1_CR18","unstructured":"Gro\u00dfmann, A.: Adaptive state-space quantisation and multi-task reinforcement learning using constructive neural networks. In: From Animals to Animats: Proceedings of The International Conference on Simulation of Adaptive Behavior (SAB), pp. 160\u2013169 (2000)"},{"key":"1_CR19","doi-asserted-by":"crossref","first-page":"35","DOI":"10.1016\/S0004-3702(01)00106-0","volume":"129","author":"E.A. Hansen","year":"2001","unstructured":"Hansen, E.A., Zilberstein, S.: LAO*: A heuristic search algorithm that finds solutions with loops. Artificial Intelligence\u00a0129, 35\u201362 (2001)","journal-title":"Artificial Intelligence"},{"key":"1_CR20","volume-title":"Dynamic Programming and Markov Processes","author":"R.A. Howard","year":"1960","unstructured":"Howard, R.A.: Dynamic Programming and Markov Processes. The MIT Press, Cambridge (1960)"},{"key":"1_CR21","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/4168.001.0001","volume-title":"Learning in Embedded Systems","author":"L.P. Kaelbling","year":"1993","unstructured":"Kaelbling, L.P.: Learning in Embedded Systems. The MIT Press, Cambridge (1993)"},{"key":"1_CR22","doi-asserted-by":"crossref","first-page":"237","DOI":"10.1613\/jair.301","volume":"4","author":"L.P. Kaelbling","year":"1996","unstructured":"Kaelbling, L.P., Littman, M.L., Moore, A.W.: Reinforcement learning: A survey. Journal of Artificial Intelligence Research\u00a04, 237\u2013285 (1996)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"1_CR23","unstructured":"Kearns, M., Singh, S.: Near-optimal reinforcement learning in polynomial time. In: Proceedings of the International Conference on Machine Learning (ICML) (1998)"},{"issue":"4","key":"1_CR24","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1080\/09528130210151831","volume":"14","author":"S. Koenig","year":"2002","unstructured":"Koenig, S., Liu, Y.: The interaction of representations and planning objectives for decision-theoretic planning. Journal of Experimental and Theoretical Artificial Intelligence\u00a014(4), 303\u2013326 (2002)","journal-title":"Journal of Experimental and Theoretical Artificial Intelligence"},{"issue":"4","key":"1_CR25","doi-asserted-by":"crossref","first-page":"1143","DOI":"10.1137\/S0363012901385691","volume":"42","author":"V. Konda","year":"2003","unstructured":"Konda, V., Tsitsiklis, J.: Actor-critic algorithms. SIAM Journal on Control and Optimization\u00a042(4), 1143\u20131166 (2003)","journal-title":"SIAM Journal on Control and Optimization"},{"key":"1_CR26","unstructured":"Konidaris, G.: A framework for transfer in reinforcement learning. In: ICML-2006 Workshop on Structural Knowledge Transfer for Machine Learning (2006)"},{"issue":"1-2","key":"1_CR27","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1016\/0004-3702(94)00087-H","volume":"76","author":"N. Kushmerick","year":"1995","unstructured":"Kushmerick, N., Hanks, S., Weld, D.S.: An algorithm for probabilistic planning. Artificial Intelligence\u00a076(1-2), 239\u2013286 (1995)","journal-title":"Artificial Intelligence"},{"key":"1_CR28","unstructured":"Littman, M.L., Dean, T., Kaelbling, L.P.: On the complexity of solving Markov decision problems. In: Proceedings of the National Conference on Artificial Intelligence (AAAI), pp. 394\u2013402 (1995)"},{"key":"1_CR29","first-page":"159","volume":"22","author":"S. Mahadevan","year":"1996","unstructured":"Mahadevan, S.: Average reward reinforcement learning: Foundations, algorithms, and empirical results. Machine Learning\u00a022, 159\u2013195 (1996)","journal-title":"Machine Learning"},{"key":"1_CR30","doi-asserted-by":"crossref","unstructured":"Maloof, M.A.: Incremental rule learning with partial instance memory for changing concepts. In: Proceedings of the International Joint Conference on Neural Networks, pp. 2764\u20132769 (2003)","DOI":"10.1109\/IJCNN.2003.1224005"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Mataric, M.J.: Reward functions for accelerated learning. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 181\u2013189 (1994)","DOI":"10.1016\/B978-1-55860-335-6.50030-1"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Matthews, W.H.: Mazes and Labyrinths: A General Account of their History and Developments. Longmans, Green and Co., London (1922); Mazes & Labyrinths: Their History & Development. Dover Publications, New York (reprinted in 1970)","DOI":"10.5962\/bhl.title.112365"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"McMahan, H.B., Likhachev, M., Gordon, G.J.: Bounded real-time dynamic programming: RTDP with monotone upper bounds and performance guarantees. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 569\u2013576 (2005)","DOI":"10.1145\/1102351.1102423"},{"issue":"1","key":"1_CR34","first-page":"103","volume":"13","author":"A.W. Moore","year":"1993","unstructured":"Moore, A.W., Atkeson, C.G.: Prioritized sweeping: Reinforcement learning with less data and less time. Machine Learning\u00a013(1), 103\u2013130 (1993)","journal-title":"Machine Learning"},{"key":"1_CR35","unstructured":"Ng, A.Y., Harada, D., Russell, S.J.: Policy invariance under reward transformations: Theory and application to reward shaping. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 278\u2013287 (1999)"},{"key":"1_CR36","first-page":"283","volume":"22","author":"J. Peng","year":"1996","unstructured":"Peng, J., Williams, R.J.: Incremental multi-step Q-learning. Machine Learning\u00a022, 283\u2013290 (1996)","journal-title":"Machine Learning"},{"key":"1_CR37","doi-asserted-by":"crossref","DOI":"10.1002\/9780470316887","volume-title":"Markov Decision Processes\u2014Discrete Stochastic Dynamic Programming","author":"M.L. Puterman","year":"1994","unstructured":"Puterman, M.L.: Markov Decision Processes\u2014Discrete Stochastic Dynamic Programming. John Wiley & Sons, Inc., New York (1994)"},{"key":"1_CR38","doi-asserted-by":"crossref","first-page":"1127","DOI":"10.1287\/mnsc.24.11.1127","volume":"24","author":"M.L. Puterman","year":"1978","unstructured":"Puterman, M.L., Shin, M.C.: Modified policy iteration algorithms for discounted Markov decision processes. Management Science\u00a024, 1127\u20131137 (1978)","journal-title":"Management Science"},{"key":"1_CR39","unstructured":"Ratitch, B.: On characteristics of Markov decision processes and reinforcement learning in large domains. PhD thesis, The School of Computer Science, McGill University, Montreal (2005)"},{"key":"1_CR40","unstructured":"Reynolds, S.I.: Reinforcement learning with exploration. PhD thesis, The School of Computer Science, The University of Birmingham, UK (2002)"},{"key":"1_CR41","unstructured":"Rummery, G.A.: Problem solving with reinforcement learning. PhD thesis, Cambridge University, Engineering Department, Cambridge, England (1995)"},{"key":"1_CR42","unstructured":"Rummery, G.A., Niranjan, M.: On-line Q-Learning using connectionist systems. Tech. Rep. CUED\/F-INFENG\/TR 166, Cambridge University, Engineering Department (1994)"},{"key":"1_CR43","volume-title":"Artificial Intelligence: a Modern Approach","author":"S.J. Russell","year":"2003","unstructured":"Russell, S.J., Norvig, P.: Artificial Intelligence: a Modern Approach, 2nd edn. Prentice Hall, New Jersey (2003)","edition":"2"},{"issue":"2","key":"1_CR44","first-page":"95","volume":"20","author":"J. Schaeffer","year":"1997","unstructured":"Schaeffer, J., Plaat, A.: Kasparov versus deep blue: The re-match. International Computer Chess Association Journal\u00a020(2), 95\u2013101 (1997)","journal-title":"International Computer Chess Association Journal"},{"key":"1_CR45","doi-asserted-by":"crossref","unstructured":"Schwartz, A.: A reinforcement learning method for maximizing undiscounted rewards. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 298\u2013305 (1993)","DOI":"10.1016\/B978-1-55860-307-3.50045-9"},{"issue":"3","key":"1_CR46","doi-asserted-by":"crossref","first-page":"287","DOI":"10.1023\/A:1007678930559","volume":"38","author":"S. Singh","year":"2000","unstructured":"Singh, S., Jaakkola, T., Littman, M., Szepesvari, C.: Convergence results for single-step on-policy reinforcement-learning algorithms. Machine Learning\u00a038(3), 287\u2013308 (2000)","journal-title":"Machine Learning"},{"key":"1_CR47","first-page":"9","volume":"3","author":"R.S. Sutton","year":"1988","unstructured":"Sutton, R.S.: Learning to predict by the methods of temporal differences. Machine Learning\u00a03, 9\u201344 (1988)","journal-title":"Machine Learning"},{"key":"1_CR48","doi-asserted-by":"crossref","unstructured":"Sutton, R.S.: Integrated architectures for learning, planning, and reacting based on approximating dynamic programming. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 216\u2013224 (1990)","DOI":"10.1016\/B978-1-55860-141-3.50030-4"},{"key":"1_CR49","doi-asserted-by":"crossref","unstructured":"Sutton, R.S.: DYNA, an integrated architecture for learning, planning and reacting. In: Working Notes of the AAAI Spring Symposium on Integrated Intelligent Architectures, pp. 151\u2013155 (1991a)","DOI":"10.7551\/mitpress\/4939.003.0012"},{"key":"1_CR50","doi-asserted-by":"crossref","unstructured":"Sutton, R.S.: Reinforcement learning architectures for animats. In: From Animals to Animats: Proceedings of The International Conference on Simulation of Adaptive Behavior (SAB), pp. 288\u2013296 (1991b)","DOI":"10.7551\/mitpress\/3115.003.0040"},{"key":"1_CR51","unstructured":"Sutton, R.S.: Generalization in reinforcement learning: Successful examples using sparse coarse coding. In: Proceedings of the Neural Information Processing Conference (NIPS), pp. 1038\u20131044 (1996)"},{"key":"1_CR52","volume-title":"Reinforcement Learning: an Introduction","author":"R.S. Sutton","year":"1998","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: an Introduction. The MIT Press, Cambridge (1998)"},{"key":"1_CR53","unstructured":"Tash, J., Russell, S.J.: Control strategies for a stochastic planner. In: Proceedings of the National Conference on Artificial Intelligence (AAAI), pp. 1079\u20131085 (1994)"},{"key":"1_CR54","unstructured":"Watkins, C.J.C.H.: Learning from delayed rewards. PhD thesis, King\u2019s College, Cambridge, England (1989)"},{"key":"1_CR55","doi-asserted-by":"crossref","unstructured":"Watkins, C.J.C.H., Dayan, P.: Q-learning. Machine Learning 8(3\/4) (1992); Special Issue on Reinforcement Learning","DOI":"10.1023\/A:1022676722315"},{"key":"1_CR56","unstructured":"Wiering, M.A.: Explorations in efficient reinforcement learning. PhD thesis, Faculteit der Wiskunde, Informatica, Natuurkunde en Sterrenkunde, Universiteit van Amsterdam (1999)"},{"key":"1_CR57","unstructured":"Wiering, M.A.: Model-based reinforcement learning in dynamic environments. Tech. Rep. UU-CS-2002-029, Institute of Information and Computing Sciences, University of Utrecht, The Netherlands (2002)"},{"key":"1_CR58","unstructured":"Wiering, M.A.: QV(\u03bb)-Learning: A new on-policy reinforcement learning algorithm. In: Proceedings of the 7th European Workshop on Reinforcement Learning (2005)"},{"key":"1_CR59","doi-asserted-by":"crossref","unstructured":"Wiering, M.A., Schmidhuber, J.H.: Efficient model-based exploration. In: From Animals to Animats: Proceedings of The International Conference on Simulation of Adaptive Behavior (SAB), pp. 223\u2013228 (1998a)","DOI":"10.7551\/mitpress\/3119.003.0034"},{"issue":"1","key":"1_CR60","doi-asserted-by":"crossref","first-page":"105","DOI":"10.1023\/A:1007562800292","volume":"33","author":"M.A. Wiering","year":"1998","unstructured":"Wiering, M.A., Schmidhuber, J.H.: Fast online Q(\u03bb). Machine Learning\u00a033(1), 105\u2013115 (1998b)","journal-title":"Machine Learning"},{"key":"1_CR61","volume-title":"Operations Research Applications and Algorithms","author":"W.L. Winston","year":"1991","unstructured":"Winston, W.L.: Operations Research Applications and Algorithms, 2nd edn. Thomson Information\/Publishing Group, Boston (1991)","edition":"2"},{"key":"1_CR62","doi-asserted-by":"crossref","first-page":"286","DOI":"10.1016\/S0019-9958(77)90354-0","volume":"34","author":"I.H. Witten","year":"1977","unstructured":"Witten, I.H.: An adaptive optimal controller for discrete-time markov environments. Information and Control\u00a034, 286\u2013295 (1977)","journal-title":"Information and Control"}],"container-title":["Adaptation, Learning, and Optimization","Reinforcement Learning"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-27645-3_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T13:01:34Z","timestamp":1742648494000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-27645-3_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642276446","9783642276453"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-27645-3_1","relation":{},"ISSN":["1867-4534","1867-4542"],"issn-type":[{"value":"1867-4534","type":"print"},{"value":"1867-4542","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}