{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:12:52Z","timestamp":1764587572949,"version":"3.40.2"},"publisher-location":"Berlin, Heidelberg","reference-count":57,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642276446"},{"type":"electronic","value":"9783642276453"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-27645-3_3","type":"book-chapter","created":{"date-parts":[[2012,3,5]],"date-time":"2012-03-05T22:18:12Z","timestamp":1330985892000},"page":"75-109","source":"Crossref","is-referenced-by-count":10,"title":["Least-Squares Methods for Policy Iteration"],"prefix":"10.1007","author":[{"given":"Lucian","family":"Bu\u015foniu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alessandro","family":"Lazaric","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohammad","family":"Ghavamzadeh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"R\u00e9mi","family":"Munos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Robert","family":"Babu\u0161ka","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bart","family":"De Schutter","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","reference":[{"issue":"1","key":"3_CR1","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/s10994-007-5038-2","volume":"71","author":"A. Antos","year":"2008","unstructured":"Antos, A., Szepesv\u00e1ri, C., Munos, R.: Learning near-optimal policies with Bellman-residual minimization based fitted policy iteration and a single sample path. Machine Learning\u00a071(1), 89\u2013129 (2008)","journal-title":"Machine Learning"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Baird, L.: Residual algorithms: Reinforcement learning with function approximation. In: Proceedings 12th International Conference on Machine Learning (ICML-1995), Tahoe City, U.S, pp. 30\u201337 (1995)","DOI":"10.1016\/B978-1-55860-377-6.50013-X"},{"key":"3_CR3","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1162\/neco.1995.7.2.270","volume":"7","author":"D.P. Bertsekas","year":"1995","unstructured":"Bertsekas, D.P.: A counterexample to temporal differences learning. Neural Computation\u00a07, 270\u2013279 (1995)","journal-title":"Neural Computation"},{"key":"3_CR4","unstructured":"Bertsekas, D.P.: Approximate dynamic programming. In: Dynamic Programming and Optimal Control, Ch. 6, vol. 2 (2010), http:\/\/web.mit.edu\/dimitrib\/www\/dpchapter.html"},{"issue":"3","key":"3_CR5","doi-asserted-by":"publisher","first-page":"310","DOI":"10.1007\/s11768-011-1005-3","volume":"9","author":"D.P. Bertsekas","year":"2011","unstructured":"Bertsekas, D.P.: Approximate policy iteration: A survey and some new methods. Journal of Control Theory and Applications\u00a09(3), 310\u2013335 (2011a)","journal-title":"Journal of Control Theory and Applications"},{"issue":"9","key":"3_CR6","doi-asserted-by":"publisher","first-page":"2128","DOI":"10.1109\/TAC.2011.2115290","volume":"56","author":"D.P. Bertsekas","year":"2011","unstructured":"Bertsekas, D.P.: Temporal difference methods for general projected equations. IEEE Transactions on Automatic Control\u00a056(9), 2128\u20132139 (2011b)","journal-title":"IEEE Transactions on Automatic Control"},{"key":"3_CR7","unstructured":"Bertsekas, D.P., Ioffe, S.: Temporal differences-based policy iteration and applications in neuro-dynamic programming. Tech. Rep. LIDS-P-2349, Massachusetts Institute of Technology, Cambridge, US (1996), http:\/\/web.mit.edu\/dimitrib\/www\/Tempdif.pdf"},{"key":"3_CR8","unstructured":"Bertsekas, D.P., Tsitsiklis, J.N.: Neuro-Dynamic Programming. Athena Scientific (1996)"},{"key":"3_CR9","unstructured":"Bertsekas, D.P., Borkar, V., Nedi\u0107, A.: Improved temporal difference methods with linear function approximation. In: Si, J., Barto, A., Powell, W. (eds.) Learning and Approximate Dynamic Programming. IEEE Press (2004)"},{"key":"3_CR10","doi-asserted-by":"publisher","first-page":"233","DOI":"10.1023\/A:1017936530646","volume":"49","author":"J. Boyan","year":"2002","unstructured":"Boyan, J.: Technical update: Least-squares temporal difference learning. Machine Learning\u00a049, 233\u2013246 (2002)","journal-title":"Machine Learning"},{"issue":"1-3","key":"3_CR11","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/BF00114723","volume":"22","author":"S.J. Bradtke","year":"1996","unstructured":"Bradtke, S.J., Barto, A.G.: Linear least-squares algorithms for temporal difference learning. Machine Learning\u00a022(1-3), 33\u201357 (1996)","journal-title":"Machine Learning"},{"key":"3_CR12","unstructured":"Bu\u015foniu, L., Babu\u0161ka, R., De Schutter, B., Ernst, D.: Reinforcement Learning and Dynamic Programming Using Function Approximators. In: Automation and Control Engineering. Taylor & Francis, CRC Press (2010a)"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Bu\u015foniu, L., De Schutter, B., Babu\u0161ka, R., Ernst, D.: Using prior knowledge to accelerate online least-squares policy iteration. In: 2010 IEEE International Conference on Automation, Quality and Testing, Robotics (AQTR-2010), Cluj-Napoca, Romania (2010b)","DOI":"10.1109\/AQTR.2010.5520917"},{"issue":"5","key":"3_CR14","doi-asserted-by":"publisher","first-page":"804","DOI":"10.1016\/j.automatica.2010.02.006","volume":"46","author":"L. Bu\u015foniu","year":"2010","unstructured":"Bu\u015foniu, L., Ernst, D., De Schutter, B., Babu\u0161ka, R.: Approximate dynamic programming with a fuzzy parameterization. Automatica\u00a046(5), 804\u2013814 (2010c)","journal-title":"Automatica"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Bu\u015foniu, L., Ernst, D., De Schutter, B., Babu\u0161ka, R.: Online least-squares policy iteration for reinforcement learning control. In: Proceedings 2010 American Control Conference (ACC-2010), Baltimore, US, pp. 486\u2013491 (2010d)","DOI":"10.1109\/ACC.2010.5530856"},{"issue":"3","key":"3_CR16","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1007\/s10994-008-5069-3","volume":"72","author":"C. Dimitrakakis","year":"2008","unstructured":"Dimitrakakis, C., Lagoudakis, M.: Rollout sampling approximate policy iteration. Machine Learning\u00a072(3), 157\u2013171 (2008)","journal-title":"Machine Learning"},{"key":"3_CR17","unstructured":"Engel, Y., Mannor, S., Meir, R.: Bayes meets Bellman: The Gaussian process approach to temporal difference learning. In: Proceedings 20th International Conference on Machine Learning (ICML-2003), Washington, US, pp. 154\u2013161 (2003)"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Engel, Y., Mannor, S., Meir, R.: Reinforcement learning with Gaussian processes. In: Proceedings 22nd International Conference on Machine Learning (ICML-2005), Bonn, Germany, pp. 201\u2013208 (2005)","DOI":"10.1145\/1102351.1102377"},{"key":"3_CR19","first-page":"503","volume":"6","author":"D. Ernst","year":"2005","unstructured":"Ernst, D., Geurts, P., Wehenkel, L.: Tree-based batch mode reinforcement learning. Journal of Machine Learning Research\u00a06, 503\u2013556 (2005)","journal-title":"Journal of Machine Learning Research"},{"key":"3_CR20","unstructured":"Farahmand, A.M., Ghavamzadeh, M., Szepesv\u00e1ri, C.S., Mannor, S.: Regularized policy iteration. In: Koller, D., Schuurmans, D., Bengio, Y., Bottou, L. (eds.) Advances in Neural Information Processing Systems, vol.\u00a021, pp. 441\u2013448. MIT Press (2009)"},{"key":"3_CR21","unstructured":"Geramifard, A., Bowling, M.H., Sutton, R.S.: Incremental least-squares temporal difference learning. In: Proceedings 21st National Conference on Artificial Intelligence and 18th Innovative Applications of Artificial Intelligence Conference (AAAI-2006), Boston, US, pp. 356\u2013361 (2006)"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Geramifard, A., Bowling, M., Zinkevich, M., Sutton, R.S.: iLSTD: Eligibility traces & convergence analysis. In: Sch\u00f6lkopf, B., Platt, J., Hofmann, T. (eds.) Advances in Neural Information Processing Systems, vol.\u00a019, pp. 440\u2013448. MIT Press (2007)","DOI":"10.7551\/mitpress\/7503.003.0060"},{"key":"3_CR23","unstructured":"Golub, G.H., Van Loan, C.F.: Matrix Computations, 3rd edn. Johns Hopkins (1996)"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Jung, T., Polani, D.: Kernelizing LSPE(\u03bb). In: Proceedings 2007 IEEE Symposium on Approximate Dynamic Programming and Reinforcement Learning (ADPRL-2007), Honolulu, US, pp. 338\u2013345 (2007a)","DOI":"10.1109\/ADPRL.2007.368208"},{"key":"3_CR25","unstructured":"Jung, T., Polani, D.: Learning RoboCup-keepaway with kernels. In: Gaussian Processes in Practice, JMLR Workshop and Conference Proceedings, vol.\u00a01, pp. 33\u201357 (2007b)"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Kolter, J.Z., Ng, A.: Regularization and feature selection in least-squares temporal difference learning. In: Proceedings 26th International Conference on Machine Learning (ICML-2009), Montreal, Canada, pp. 521\u2013528 (2009)","DOI":"10.1145\/1553374.1553442"},{"key":"3_CR27","unstructured":"Konda, V.: Actor-critic algorithms. PhD thesis, Massachusetts Institute of Technology, Cambridge, US (2002)"},{"key":"3_CR28","series-title":"Lecture Notes in Artificial Intelligence","doi-asserted-by":"publisher","first-page":"249","DOI":"10.1007\/3-540-46014-4_23","volume-title":"Methods and Applications of Artificial Intelligence","author":"M. Lagoudakis","year":"2002","unstructured":"Lagoudakis, M., Parr, R., Littman, M.: Least-squares Methods in Reinforcement Learning for Control. In: Vlahavas, I.P., Spyropoulos, C.D. (eds.) SETN 2002. LNCS (LNAI), vol.\u00a02308, pp. 249\u2013260. Springer, Heidelberg (2002)"},{"key":"3_CR29","first-page":"1107","volume":"4","author":"M.G. Lagoudakis","year":"2003","unstructured":"Lagoudakis, M.G., Parr, R.: Least-squares policy iteration. Journal of Machine Learning Research\u00a04, 1107\u20131149 (2003a)","journal-title":"Journal of Machine Learning Research"},{"key":"3_CR30","unstructured":"Lagoudakis, M.G., Parr, R.: Reinforcement learning as classification: Leveraging modern classifiers. In: Proceedings 20th International Conference on Machine Learning (ICML-2003), Washington, US, pp. 424\u2013431 (2003b)"},{"key":"3_CR31","unstructured":"Lazaric, A., Ghavamzadeh, M., Munos, R.: Analysis of a classification-based policy iteration algorithm. In: Proceedings 27th International Conference on Machine Learning (ICML-2010), Haifa, Israel, pp. 607\u2013614 (2010a)"},{"key":"3_CR32","unstructured":"Lazaric, A., Ghavamzadeh, M., Munos, R.: Finite-sample analysis of LSTD. In: Proceedings 27th International Conference on Machine Learning (ICML-2010), Haifa, Israel, pp. 615\u2013622 (2010b)"},{"key":"3_CR33","unstructured":"Li, L., Littman, M.L., Mansley, C.R.: Online exploration in least-squares policy iteration. In: Proceedings 8th International Joint Conference on Autonomous Agents and Multiagent Systems (AAMAS-2009), Budapest, Hungary, vol.\u00a02, pp. 733\u2013739 (2009)"},{"key":"3_CR34","unstructured":"Maei, H.R., Szepesv\u00e1ri, C., Bhatnagar, S., Sutton, R.S.: Toward off-policy learning control with function approximation. In: Proceedings 27th International Conference on Machine Learning (ICML-2010), Haifa, Israel, pp. 719\u2013726 (2010)"},{"key":"3_CR35","unstructured":"Maillard, O.A., Munos, R., Lazaric, A., Ghavamzadeh, M.: Finite-sample analysis of Bellman residual minimization, vol.\u00a013, pp. 299\u2013314 (2010)"},{"key":"3_CR36","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4471-3267-7","volume-title":"Markov chains and stochastic stability","author":"S. Meyn","year":"1993","unstructured":"Meyn, S., Tweedie, L.: Markov chains and stochastic stability. Springer, Heidelberg (1993)"},{"issue":"3","key":"3_CR37","first-page":"199","volume":"21","author":"A.W. Moore","year":"1995","unstructured":"Moore, A.W., Atkeson, C.R.: The parti-game algorithm for variable resolution reinforcement learning in multidimensional state-spaces. Machine Learning\u00a021(3), 199\u2013233 (1995)","journal-title":"Machine Learning"},{"key":"3_CR38","unstructured":"Munos, R.: Error bounds for approximate policy iteration. In: Proceedings 20th International Conference (ICML-2003), Washington, US, pp. 560\u2013567 (2003)"},{"key":"3_CR39","unstructured":"Munos, R.: Approximate dynamic programming. In: Markov Decision Processes in Artificial Intelligence. Wiley (2010)"},{"key":"3_CR40","first-page":"815","volume":"9","author":"R. Munos","year":"2008","unstructured":"Munos, R., Szepesv\u00e1ri, C.S.: Finite time bounds for fitted value iteration. Journal of Machine Learning Research\u00a09, 815\u2013857 (2008)","journal-title":"Journal of Machine Learning Research"},{"issue":"1-2","key":"3_CR41","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1023\/A:1022192903948","volume":"13","author":"A. Nedi\u0107","year":"2003","unstructured":"Nedi\u0107, A., Bertsekas, D.P.: Least-squares policy evaluation algorithms with linear function approximation. Discrete Event Dynamic Systems: Theory and Applications\u00a013(1-2), 79\u2013110 (2003)","journal-title":"Discrete Event Dynamic Systems: Theory and Applications"},{"key":"3_CR42","unstructured":"Rasmussen, C.E., Kuss, M.: Gaussian processes in reinforcement learning. In: Thrun, S., Saul, L.K., Sch\u00f6lkopf, B. (eds.) Advances in Neural Information Processing Systems, vol.\u00a016. MIT Press (2004)"},{"key":"3_CR43","unstructured":"Scherrer, B.: Should one compute the Temporal Difference fix point or minimize the Bellman Residual? the unified oblique projection view. In: Proceedings 27th International Conference on Machine Learning (ICML-2010), Haifa, Israel, pp. 959\u2013966 (2010)"},{"issue":"2","key":"3_CR44","doi-asserted-by":"publisher","first-page":"568","DOI":"10.1016\/0022-247X(85)90317-8","volume":"110","author":"P.J. Schweitzer","year":"1985","unstructured":"Schweitzer, P.J., Seidmann, A.: Generalized polynomial approximations in Markovian decision processes. Journal of Mathematical Analysis and Applications\u00a0110(2), 568\u2013582 (1985)","journal-title":"Journal of Mathematical Analysis and Applications"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Sutton, R., Maei, H., Precup, D., Bhatnagar, S., Silver, D., Szepesvari, C.S., Wiewiora, E.: Fast gradient-descent methods for temporal-difference learning with linear function approximation. In: Proceedings 26th International Conference on Machine Learning (ICML-2009), Montreal, Canada, pp. 993\u20131000 (2009a)","DOI":"10.1145\/1553374.1553501"},{"key":"3_CR46","first-page":"9","volume":"3","author":"R.S. Sutton","year":"1988","unstructured":"Sutton, R.S.: Learning to predict by the method of temporal differences. Machine Learning\u00a03, 9\u201344 (1988)","journal-title":"Machine Learning"},{"key":"3_CR47","doi-asserted-by":"crossref","unstructured":"Sutton, R.S., Szepesv\u00e1ri, C.S., Maei, H.R.: A convergent O(n) temporal-difference algorithm for off-policy learning with linear function approximation. In: Koller, D., Schuurmans, D., Bengio, Y., Bottou, L. (eds.) Advances in Neural Information Processing Systems, vol.\u00a021, pp. 1609\u20131616. MIT Press (2009b)","DOI":"10.1145\/1553374.1553501"},{"key":"3_CR48","doi-asserted-by":"crossref","unstructured":"Szepesv\u00e1ri, C.S.: Algorithms for Reinforcement Learning. Morgan & Claypool Publishers (2010)","DOI":"10.2200\/S00268ED1V01Y201005AIM009"},{"key":"3_CR49","doi-asserted-by":"crossref","unstructured":"Taylor, G., Parr, R.: Kernelized value function approximation for reinforcement learning. In: Proceedings 26th International Conference on Machine Learning (ICML-2009), Montreal, Canada, pp. 1017\u20131024 (2009)","DOI":"10.1145\/1553374.1553504"},{"key":"3_CR50","unstructured":"Thiery, C., Scherrer, B.: Least-squares \u03bb policy iteration: Bias-variance trade-off in control problems. In: Proceedings 27th International Conference on Machine Learning (ICML-2010), Haifa, Israel, pp. 1071\u20131078 (2010)"},{"key":"3_CR51","first-page":"59","volume":"3","author":"J.N. Tsitsiklis","year":"2002","unstructured":"Tsitsiklis, J.N.: On the convergence of optimistic policy iteration. Journal of Machine Learning Research\u00a03, 59\u201372 (2002)","journal-title":"Journal of Machine Learning Research"},{"issue":"5","key":"3_CR52","doi-asserted-by":"publisher","first-page":"674","DOI":"10.1109\/9.580874","volume":"42","author":"J.N. Tsitsiklis","year":"1997","unstructured":"Tsitsiklis, J.N., Van Roy, B.: An analysis of temporal difference learning with function approximation. IEEE Transactions on Automatic Control\u00a042(5), 674\u2013690 (1997)","journal-title":"IEEE Transactions on Automatic Control"},{"issue":"9","key":"3_CR53","first-page":"54","volume":"11","author":"X. Xu","year":"2005","unstructured":"Xu, X., Xie, T., Hu, D., Lu, X.: Kernel least-squares temporal difference learning. International Journal of Information Technology\u00a011(9), 54\u201363 (2005)","journal-title":"International Journal of Information Technology"},{"issue":"4","key":"3_CR54","doi-asserted-by":"publisher","first-page":"973","DOI":"10.1109\/TNN.2007.899161","volume":"18","author":"X. Xu","year":"2007","unstructured":"Xu, X., Hu, D., Lu, X.: Kernel-based least-squares policy iteration for reinforcement learning. IEEE Transactions on Neural Networks\u00a018(4), 973\u2013992 (2007)","journal-title":"IEEE Transactions on Neural Networks"},{"key":"3_CR55","unstructured":"Yu, H.: Convergence of least squares temporal difference methods under general conditions. In: Proceedings 27th International Conference on Machine Learning (ICML-2010), Haifa, Israel, pp. 1207\u20131214 (2010)"},{"issue":"7","key":"3_CR56","doi-asserted-by":"publisher","first-page":"1515","DOI":"10.1109\/TAC.2009.2022097","volume":"54","author":"H. Yu","year":"2009","unstructured":"Yu, H., Bertsekas, D.P.: Convergence results for some temporal difference methods based on least squares. IEEE Transactions on Automatic Control\u00a054(7), 1515\u20131531 (2009)","journal-title":"IEEE Transactions on Automatic Control"},{"issue":"2","key":"3_CR57","doi-asserted-by":"publisher","first-page":"306","DOI":"10.1287\/moor.1100.0441","volume":"35","author":"H. Yu","year":"2010","unstructured":"Yu, H., Bertsekas, D.P.: Error bounds for approximations from projected linear equations. Mathematics of Operations Research\u00a035(2), 306\u2013329 (2010)","journal-title":"Mathematics of Operations Research"}],"container-title":["Adaptation, Learning, and Optimization","Reinforcement Learning"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-27645-3_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T13:01:17Z","timestamp":1742648477000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-27645-3_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642276446","9783642276453"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-27645-3_3","relation":{},"ISSN":["1867-4534","1867-4542"],"issn-type":[{"type":"print","value":"1867-4534"},{"type":"electronic","value":"1867-4542"}],"subject":[],"published":{"date-parts":[[2012]]}}}