{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T03:13:20Z","timestamp":1771038800690,"version":"3.50.1"},"publisher-location":"Berlin, Heidelberg","reference-count":175,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642276446","type":"print"},{"value":"9783642276453","type":"electronic"}],"license":[{"start":{"date-parts":[[2012,1,1]],"date-time":"2012-01-01T00:00:00Z","timestamp":1325376000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2012,1,1]],"date-time":"2012-01-01T00:00:00Z","timestamp":1325376000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-27645-3_7","type":"book-chapter","created":{"date-parts":[[2012,3,5]],"date-time":"2012-03-05T22:18:12Z","timestamp":1330985892000},"page":"207-251","source":"Crossref","is-referenced-by-count":107,"title":["Reinforcement Learning in Continuous State and Action Spaces"],"prefix":"10.1007","author":[{"given":"Hado","family":"van Hasselt","sequence":"first","affiliation":[]}],"member":"297","reference":[{"key":"7_CR1","series-title":"Lecture Notes in Computer Science","first-page":"154","volume-title":"Parallel Problem Solving from Nature, PPSN XI","author":"Y. Akimoto","year":"2010","unstructured":"Akimoto, Y., Nagata, Y., Ono, I., Kobayashi, S.: Bidirectional Relation Between CMA Evolution Strategies and Natural Evolution Strategies. In: Schaefer, R., Cotta, C., Ko\u0142odziej, J., Rudolph, G. (eds.) PPSN XI. LNCS, vol.\u00a06238, pp. 154\u2013163. Springer, Heidelberg (2010)"},{"key":"7_CR2","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1016\/0025-5564(71)90051-4","volume":"10","author":"J.S. Albus","year":"1971","unstructured":"Albus, J.S.: A theory of cerebellar function. Mathematical Biosciences\u00a010, 25\u201361 (1971)","journal-title":"Mathematical Biosciences"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Albus, J.S.: A new approach to manipulator control: The cerebellar model articulation controller (CMAC). In: Dynamic Systems, Measurement and Control, pp. 220\u2013227 (1975)","DOI":"10.1115\/1.3426922"},{"issue":"2","key":"7_CR4","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1162\/089976698300017746","volume":"10","author":"S.I. Amari","year":"1998","unstructured":"Amari, S.I.: Natural gradient works efficiently in learning. Neural Computation\u00a010(2), 251\u2013276 (1998)","journal-title":"Neural Computation"},{"issue":"3","key":"7_CR5","doi-asserted-by":"crossref","first-page":"31","DOI":"10.1109\/37.24809","volume":"9","author":"C.W. Anderson","year":"1989","unstructured":"Anderson, C.W.: Learning to control an inverted pendulum using neural networks. IEEE Control Systems Magazine\u00a09(3), 31\u201337 (1989)","journal-title":"IEEE Control Systems Magazine"},{"key":"7_CR6","unstructured":"Antos, A., Munos, R., Szepesv\u00e1ri, C.: Fitted Q-iteration in continuous action-space MDPs. In: Advances in Neural Information Processing Systems (NIPS-2007), vol.\u00a020, pp. 9\u201316 (2008a)"},{"issue":"1","key":"7_CR7","doi-asserted-by":"crossref","first-page":"89","DOI":"10.1007\/s10994-007-5038-2","volume":"71","author":"A. Antos","year":"2008","unstructured":"Antos, A., Szepesv\u00e1ri, C., Munos, R.: Learning near-optimal policies with Bellman-residual minimization based fitted policy iteration and a single sample path. Machine Learning\u00a071(1), 89\u2013129 (2008b)","journal-title":"Machine Learning"},{"key":"7_CR8","unstructured":"Babuska, R.: Fuzzy modeling for control. Kluwer Academic Publishers (1998)"},{"key":"7_CR9","doi-asserted-by":"crossref","DOI":"10.1093\/oso\/9780195099713.001.0001","volume-title":"Evolutionary algorithms in theory and practice: evolution strategies, evolutionary programming, genetic algorithms","author":"T. B\u00e4ck","year":"1996","unstructured":"B\u00e4ck, T.: Evolutionary algorithms in theory and practice: evolution strategies, evolutionary programming, genetic algorithms. Oxford University Press, USA (1996)"},{"issue":"1","key":"7_CR10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/evco.1993.1.1.1","volume":"1","author":"T. B\u00e4ck","year":"1993","unstructured":"B\u00e4ck, T., Schwefel, H.P.: An overview of evolutionary algorithms for parameter optimization. Evolutionary Computation\u00a01(1), 1\u201323 (1993)","journal-title":"Evolutionary Computation"},{"key":"7_CR11","first-page":"30","volume-title":"Machine Learning: Proceedings of the Twelfth International Conference","author":"L. Baird","year":"1995","unstructured":"Baird, L.: Residual algorithms: Reinforcement learning with function approximation. In: Prieditis, A., Russell, S. (eds.) Machine Learning: Proceedings of the Twelfth International Conference, pp. 30\u201337. Morgan Kaufmann Publishers, San Francisco (1995)"},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Baird, L.C., Klopf, A.H.: Reinforcement learning with high-dimensional, continuous actions. Tech. Rep. WL-TR-93-114, Wright Laboratory, Wright-Patterson Air Force Base, OH (1993)","DOI":"10.21236\/ADA280844"},{"key":"7_CR13","doi-asserted-by":"crossref","DOI":"10.1007\/978-0-8176-4755-1","volume-title":"Optimal control and viscosity solutions of Hamilton\u2013Jacobi\u2013Bellman equations","author":"M. Bardi","year":"1997","unstructured":"Bardi, M., Dolcetta, I.C.: Optimal control and viscosity solutions of Hamilton\u2013Jacobi\u2013Bellman equations. Springer, Heidelberg (1997)"},{"key":"7_CR14","doi-asserted-by":"crossref","first-page":"834","DOI":"10.1109\/TSMC.1983.6313077","volume":"SMC-13","author":"A.G. Barto","year":"1983","unstructured":"Barto, A.G., Sutton, R.S., Anderson, C.W.: Neuronlike adaptive elements that can solve difficult learning control problems. IEEE Transactions on Systems, Man, and Cybernetics\u00a0SMC-13, 834\u2013846 (1983)","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics"},{"key":"7_CR15","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1613\/jair.806","volume":"15","author":"J. Baxter","year":"2001","unstructured":"Baxter, J., Bartlett, P.L.: Infinite-horizon policy-gradient estimation. Journal of Artificial Intelligence Research\u00a015, 319\u2013350 (2001)","journal-title":"Journal of Artificial Intelligence Research"},{"issue":"3","key":"7_CR16","doi-asserted-by":"crossref","first-page":"589","DOI":"10.1023\/A:1022664528457","volume":"96","author":"R. Beard","year":"1998","unstructured":"Beard, R., Saridis, G., Wen, J.: Approximate solutions to the time-invariant Hamilton\u2013Jacobi\u2013Bellman equation. Journal of Optimization theory and Applications\u00a096(3), 589\u2013626 (1998)","journal-title":"Journal of Optimization theory and Applications"},{"key":"7_CR17","unstructured":"Bellman, R.: Dynamic Programming. Princeton University Press (1957)"},{"issue":"3-4","key":"7_CR18","doi-asserted-by":"crossref","first-page":"283","DOI":"10.1016\/S0921-8890(97)00043-2","volume":"22","author":"H. Benbrahim","year":"1997","unstructured":"Benbrahim, H., Franklin, J.A.: Biped dynamic walking using reinforcement learning. Robotics and Autonomous Systems\u00a022(3-4), 283\u2013302 (1997)","journal-title":"Robotics and Autonomous Systems"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Berenji, H.: Fuzzy Q-learning: a new approach for fuzzy dynamic programming. In: Proceedings of the Third IEEE Conference on Fuzzy Systems, IEEE World Congress on Computational Intelligence, pp. 486\u2013491. IEEE (1994)","DOI":"10.1109\/FUZZY.1994.343737"},{"issue":"5","key":"7_CR20","doi-asserted-by":"crossref","first-page":"724","DOI":"10.1109\/72.159061","volume":"3","author":"H. Berenji","year":"1992","unstructured":"Berenji, H., Khedkar, P.: Learning and tuning fuzzy logic controllers through reinforcements. IEEE Transactions on Neural Networks\u00a03(5), 724\u2013740 (1992)","journal-title":"IEEE Transactions on Neural Networks"},{"key":"7_CR21","unstructured":"Bertsekas, D.P.: Dynamic Programming and Optimal Control, vol.\u00a0I. Athena Scientific (2005)"},{"key":"7_CR22","unstructured":"Bertsekas, D.P.: Dynamic Programming and Optimal Control, vol.\u00a0II. Athena Scientific (2007)"},{"key":"7_CR23","volume-title":"Neuro-dynamic Programming","author":"D.P. Bertsekas","year":"1996","unstructured":"Bertsekas, D.P., Tsitsiklis, J.N.: Neuro-dynamic Programming. Athena Scientific, Belmont (1996)"},{"key":"7_CR24","unstructured":"Bertsekas, D.P., Borkar, V.S., Nedic, A.: Improved temporal difference methods with linear function approximation. In: Handbook of Learning and Approximate Dynamic Programming, pp. 235\u2013260 (2004)"},{"issue":"1","key":"7_CR25","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1023\/A:1015059928466","volume":"1","author":"H. Beyer","year":"2002","unstructured":"Beyer, H., Schwefel, H.: Evolution strategies\u2013a comprehensive introduction. Natural Computing\u00a01(1), 3\u201352 (2002)","journal-title":"Natural Computing"},{"issue":"11","key":"7_CR26","doi-asserted-by":"crossref","first-page":"2471","DOI":"10.1016\/j.automatica.2009.07.008","volume":"45","author":"S. Bhatnagar","year":"2009","unstructured":"Bhatnagar, S., Sutton, R.S., Ghavamzadeh, M., Lee, M.: Natural actor-critic algorithms. Automatica\u00a045(11), 2471\u20132482 (2009)","journal-title":"Automatica"},{"key":"7_CR27","doi-asserted-by":"crossref","DOI":"10.1093\/oso\/9780198538493.001.0001","volume-title":"Neural networks for pattern recognition","author":"C.M. Bishop","year":"1995","unstructured":"Bishop, C.M.: Neural networks for pattern recognition. Oxford University Press, USA (1995)"},{"key":"7_CR28","volume-title":"Pattern recognition and machine learning","author":"C.M. Bishop","year":"2006","unstructured":"Bishop, C.M.: Pattern recognition and machine learning. Springer, New York (2006)"},{"key":"7_CR29","series-title":"Studies in Fuzziness","first-page":"447","volume-title":"Genetic Algorithms and Soft Computing","author":"A. Bonarini","year":"1996","unstructured":"Bonarini, A.: Delayed reinforcement, fuzzy Q-learning and fuzzy logic controllers. In: Herrera, F., Verdegay, J.L. (eds.) Genetic Algorithms and Soft Computing. Studies in Fuzziness, vol.\u00a08, pp. 447\u2013466. Physica-Verlag, Berlin (1996)"},{"issue":"2","key":"7_CR30","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1023\/A:1017936530646","volume":"49","author":"J.A. Boyan","year":"2002","unstructured":"Boyan, J.A.: Technical update: Least-squares temporal difference learning. Machine Learning\u00a049(2), 233\u2013246 (2002)","journal-title":"Machine Learning"},{"key":"7_CR31","first-page":"33","volume":"22","author":"S.J. Bradtke","year":"1996","unstructured":"Bradtke, S.J., Barto, A.G.: Linear least-squares algorithms for temporal difference learning. Machine Learning\u00a022, 33\u201357 (1996)","journal-title":"Machine Learning"},{"key":"7_CR32","unstructured":"Bryson, A., Ho, Y.: Applied Optimal Control. Blaisdell Publishing Co. (1969)"},{"key":"7_CR33","series-title":"Lecture Notes in Artificial Intelligence","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1007\/978-3-540-77949-0_3","volume-title":"Adaptive Agents and Multi-Agent Systems III. Adaptation and Multi-Agent Learning","author":"L. Bu\u015foniu","year":"2008","unstructured":"Bu\u015foniu, L., Ernst, D., De Schutter, B., Babu\u0161ka, R.: Continuous-State Reinforcement Learning with Fuzzy Approximation. In: Tuyls, K., Nowe, A., Guessoum, Z., Kudenko, D. (eds.) ALAMAS 2005, ALAMAS 2006, and ALAMAS 2007. LNCS (LNAI), vol.\u00a04865, pp. 27\u201343. Springer, Heidelberg (2008)"},{"key":"7_CR34","volume-title":"Reinforcement Learning and Dynamic Programming Using Function Approximators","author":"L. Bu\u015foniu","year":"2010","unstructured":"Bu\u015foniu, L., Babu\u0161ka, R., De Schutter, B., Ernst, D.: Reinforcement Learning and Dynamic Programming Using Function Approximators. CRC Press, Boca Raton (2010)"},{"key":"7_CR35","unstructured":"Coulom, R.: Reinforcement learning using neural networks, with applications to motor control. PhD thesis, Institut National Polytechnique de Grenoble (2002)"},{"key":"7_CR36","first-page":"1017","volume-title":"Advances in Neural Information Processing Systems","author":"R.H. Crites","year":"1996","unstructured":"Crites, R.H., Barto, A.G.: Improving elevator performance using reinforcement learning. In: Touretzky, D.S., Mozer, M.C., Hasselmo, M.E. (eds.) Advances in Neural Information Processing Systems, vol.\u00a08, pp. 1017\u20131023. MIT Press, Cambridge (1996)"},{"issue":"2\/3","key":"7_CR37","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1023\/A:1007518724497","volume":"33","author":"R.H. Crites","year":"1998","unstructured":"Crites, R.H., Barto, A.G.: Elevator group control using multiple reinforcement learning agents. Machine Learning\u00a033(2\/3), 235\u2013262 (1998)","journal-title":"Machine Learning"},{"key":"7_CR38","unstructured":"Davis, L.: Handbook of genetic algorithms. Arden Shakespeare (1991)"},{"key":"7_CR39","first-page":"341","volume":"8","author":"P. Dayan","year":"1992","unstructured":"Dayan, P.: The convergence of TD(\u03bb) for general lambda. Machine Learning\u00a08, 341\u2013362 (1992)","journal-title":"Machine Learning"},{"key":"7_CR40","first-page":"295","volume":"14","author":"P. Dayan","year":"1994","unstructured":"Dayan, P., Sejnowski, T.: TD(\u03bb): Convergence with probability 1. Machine Learning\u00a014, 295\u2013301 (1994)","journal-title":"Machine Learning"},{"key":"7_CR41","unstructured":"Dearden, R., Friedman, N., Russell, S.: Bayesian Q-learning. In: Proceedings of the Fifteenth National\/Tenth Conference on Artificial Intelligence\/Innovative Applications of Artificial Intelligence, pp. 761\u2013768. American Association for Artificial Intelligence (1998)"},{"key":"7_CR42","unstructured":"Dearden, R., Friedman, N., Andre, D.: Model based Bayesian exploration. In: Proceedings of the Fifteenth Conference on Uncertainty in Artificial Intelligence, pp. 150\u2013159 (1999)"},{"key":"7_CR43","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-662-05094-1","volume-title":"Introduction to evolutionary computing","author":"A.E. Eiben","year":"2003","unstructured":"Eiben, A.E., Smith, J.E.: Introduction to evolutionary computing. Springer, Heidelberg (2003)"},{"issue":"1","key":"7_CR44","first-page":"503","volume":"6","author":"D. Ernst","year":"2005","unstructured":"Ernst, D., Geurts, P., Wehenkel, L.: Tree-based batch mode reinforcement learning. Journal of Machine Learning Research\u00a06(1), 503\u2013556 (2005)","journal-title":"Journal of Machine Learning Research"},{"key":"7_CR45","first-page":"309","volume":"222","author":"R.A. Fisher","year":"1922","unstructured":"Fisher, R.A.: On the mathematical foundations of theoretical statistics. Philosophical Transactions of the Royal Society of London Series A, Containing Papers of a Mathematical or Physical Character\u00a0222, 309\u2013368 (1922)","journal-title":"Philosophical Transactions of the Royal Society of London Series A, Containing Papers of a Mathematical or Physical Character"},{"key":"7_CR46","volume-title":"Statistical methods for research workers","author":"R.A. Fisher","year":"1925","unstructured":"Fisher, R.A.: Statistical methods for research workers. Oliver & Boyd, Edinburgh (1925)"},{"key":"7_CR47","unstructured":"Fr\u00e4mling, K.: Replacing eligibility trace for action-value learning with function approximation. In: Proceedings of the 15th European Symposium on Artificial Neural Networks (ESANN-2007), pp. 313\u2013318. d-side publishing (2007)"},{"key":"7_CR48","doi-asserted-by":"crossref","unstructured":"Gaskett, C., Wettergreen, D., Zelinsky, A.: Q-learning in continuous state and action spaces. In: Advanced Topics in Artificial Intelligence, pp. 417\u2013428 (1999)","DOI":"10.1007\/3-540-46695-9_35"},{"key":"7_CR49","unstructured":"Geramifard, A., Bowling, M., Sutton, R.S.: Incremental least-squares temporal difference learning. In: Proceedings of the 21st National Conference on Artificial Intelligence, vol.\u00a01, pp. 356\u2013361. AAAI Press (2006)"},{"key":"7_CR50","doi-asserted-by":"crossref","unstructured":"Geramifard, A., Bowling, M., Zinkevich, M., Sutton, R.: ilstd: Eligibility traces and convergence analysis. In: Advances in Neural Information Processing Systems, vol.\u00a019, pp. 441\u2013448 (2007)","DOI":"10.7551\/mitpress\/7503.003.0060"},{"key":"7_CR51","doi-asserted-by":"crossref","unstructured":"Glasmachers, T., Schaul, T., Yi, S., Wierstra, D., Schmidhuber, J.: Exponential natural evolution strategies. In: Proceedings of the 12th Annual Conference on Genetic and Evolutionary Computation, pp. 393\u2013400. ACM (2010)","DOI":"10.1145\/1830483.1830557"},{"key":"7_CR52","doi-asserted-by":"crossref","unstructured":"Glorennec, P.: Fuzzy Q-learning and dynamical fuzzy Q-learning. In: Proceedings of the Third IEEE Conference on Fuzzy Systems, IEEE World Congress on Computational Intelligence, pp. 474\u2013479. IEEE (1994)","DOI":"10.1109\/FUZZY.1994.343739"},{"key":"7_CR53","doi-asserted-by":"crossref","DOI":"10.1007\/b101874","volume-title":"Handbook of metaheuristics","author":"F. Glover","year":"2003","unstructured":"Glover, F., Kochenberger, G.: Handbook of metaheuristics. Springer, Heidelberg (2003)"},{"key":"7_CR54","first-page":"937","volume":"9","author":"F. Gomez","year":"2008","unstructured":"Gomez, F., Schmidhuber, J., Miikkulainen, R.: Accelerated neural evolution through cooperatively coevolved synapses. The Journal of Machine Learning Research\u00a09, 937\u2013965 (2008)","journal-title":"The Journal of Machine Learning Research"},{"key":"7_CR55","first-page":"261","volume-title":"Proceedings of the Twelfth International Conference on Machine Learning (ICML 1995)","author":"G.J. Gordon","year":"1995","unstructured":"Gordon, G.J.: Stable function approximation in dynamic programming. In: Prieditis, A., Russell, S. (eds.) Proceedings of the Twelfth International Conference on Machine Learning (ICML 1995), pp. 261\u2013268. Morgan Kaufmann, San Francisco (1995)"},{"key":"7_CR56","unstructured":"Gordon, G.J.: Approximate solutions to Markov decision processes. PhD thesis, Carnegie Mellon University (1999)"},{"key":"7_CR57","first-page":"1471","volume":"5","author":"E. Greensmith","year":"2004","unstructured":"Greensmith, E., Bartlett, P.L., Baxter, J.: Variance reduction techniques for gradient estimates in reinforcement learning. The Journal of Machine Learning Research\u00a05, 1471\u20131530 (2004)","journal-title":"The Journal of Machine Learning Research"},{"issue":"2","key":"7_CR58","doi-asserted-by":"crossref","first-page":"159","DOI":"10.1162\/106365601750190398","volume":"9","author":"N. Hansen","year":"2001","unstructured":"Hansen, N., Ostermeier, A.: Completely derandomized self-adaptation in evolution strategies. Evolutionary Computation\u00a09(2), 159\u2013195 (2001)","journal-title":"Evolutionary Computation"},{"issue":"1","key":"7_CR59","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/106365603321828970","volume":"11","author":"N. Hansen","year":"2003","unstructured":"Hansen, N., M\u00fcller, S.D., Koumoutsakos, P.: Reducing the time complexity of the derandomized evolution strategy with covariance matrix adaptation (CMA-ES). Evolutionary Computation\u00a011(1), 1\u201318 (2003)","journal-title":"Evolutionary Computation"},{"key":"7_CR60","doi-asserted-by":"crossref","first-page":"1689","DOI":"10.1145\/1830761.1830790","volume-title":"Proceedings of the 12th Annual Conference Companion on Genetic and Evolutionary Computation, GECCO 2010","author":"N. Hansen","year":"2010","unstructured":"Hansen, N., Auger, A., Ros, R., Finck, S., Po\u0161\u00edk, P.: Comparing results of 31 algorithms from the black-box optimization benchmarking BBOB-2009. In: Proceedings of the 12th Annual Conference Companion on Genetic and Evolutionary Computation, GECCO 2010, pp. 1689\u20131696. ACM, New York (2010)"},{"key":"7_CR61","unstructured":"Haykin, S.: Neural Networks: A Comprehensive Foundation. Prentice Hall PTR (1994)"},{"key":"7_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"428","DOI":"10.1007\/978-3-540-87700-4_43","volume-title":"Parallel Problem Solving from Nature \u2013 PPSN X","author":"V. Heidrich-Meisner","year":"2008","unstructured":"Heidrich-Meisner, V., Igel, C.: Evolution Strategies for Direct Policy Search. In: Rudolph, G., Jansen, T., Lucas, S., Poloni, C., Beume, N. (eds.) PPSN 2008. LNCS, vol.\u00a05199, pp. 428\u2013437. Springer, Heidelberg (2008)"},{"issue":"3","key":"7_CR63","doi-asserted-by":"crossref","first-page":"297","DOI":"10.1145\/321127.321128","volume":"9","author":"J.H. Holland","year":"1962","unstructured":"Holland, J.H.: Outline for a logical theory of adaptive systems. Journal of the ACM (JACM)\u00a09(3), 297\u2013314 (1962)","journal-title":"Journal of the ACM (JACM)"},{"key":"7_CR64","volume-title":"Adaptation in Natural and Artificial Systems","author":"J.H. Holland","year":"1975","unstructured":"Holland, J.H.: Adaptation in Natural and Artificial Systems. University of Michigan Press, Ann Arbor (1975)"},{"key":"7_CR65","unstructured":"Howard, R.A.: Dynamic programming and Markov processes. MIT Press (1960)"},{"issue":"2","key":"7_CR66","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1377612.1377613","volume":"35","author":"W. Huyer","year":"2008","unstructured":"Huyer, W., Neumaier, A.: SNOBFIT\u2013stable noisy optimization by branch and fit. ACM Transactions on Mathematical Software (TOMS)\u00a035(2), 1\u201325 (2008)","journal-title":"ACM Transactions on Mathematical Software (TOMS)"},{"key":"7_CR67","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1007\/978-3-540-87700-4_22","volume-title":"Parallel Problem Solving from Nature \u2013 PPSN X","author":"F. Jiang","year":"2008","unstructured":"Jiang, F., Berry, H., Schoenauer, M.: Supervised and Evolutionary Learning of Echo State Networks. In: Rudolph, G., Jansen, T., Lucas, S., Poloni, C., Beume, N. (eds.) PPSN 2008. LNCS, vol.\u00a05199, pp. 215\u2013224. Springer, Heidelberg (2008)"},{"issue":"3","key":"7_CR68","doi-asserted-by":"crossref","first-page":"338","DOI":"10.1109\/5326.704563","volume":"28","author":"L. Jouffe","year":"1998","unstructured":"Jouffe, L.: Fuzzy inference system learning by reinforcement methods. IEEE Transactions on Systems, Man, and Cybernetics, Part C: Applications and Reviews\u00a028(3), 338\u2013355 (1998)","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part C: Applications and Reviews"},{"key":"7_CR69","unstructured":"Kakade, S.: A natural policy gradient. In: Dietterich, T.G., Becker, S., Ghahramani, Z. (eds.) Advances in Neural Information Processing Systems 14 (NIPS-2001), pp. 1531\u20131538. MIT Press (2001)"},{"key":"7_CR70","doi-asserted-by":"crossref","unstructured":"Kennedy, J., Eberhart, R.C.: Particle swarm optimization. In: Proceedings of IEEE International Conference on Neural Networks, Perth, Australia, vol.\u00a04, pp. 1942\u20131948 (1995)","DOI":"10.1109\/ICNN.1995.488968"},{"issue":"5","key":"7_CR71","doi-asserted-by":"crossref","first-page":"975","DOI":"10.1007\/BF01009452","volume":"34","author":"S. Kirkpatrick","year":"1984","unstructured":"Kirkpatrick, S.: Optimization by simulated annealing: Quantitative studies. Journal of Statistical Physics\u00a034(5), 975\u2013986 (1984)","journal-title":"Journal of Statistical Physics"},{"key":"7_CR72","volume-title":"Fuzzy sets and fuzzy logic: theory and applications","author":"G. Klir","year":"1995","unstructured":"Klir, G., Yuan, B.: Fuzzy sets and fuzzy logic: theory and applications. Prentice Hall PTR, Upper Saddle River (1995)"},{"key":"7_CR73","unstructured":"Konda, V.: Actor-critic algorithms. PhD thesis, Massachusetts Institute of Technology (2002)"},{"issue":"1","key":"7_CR74","doi-asserted-by":"crossref","first-page":"94","DOI":"10.1137\/S036301299731669X","volume":"38","author":"V.R. Konda","year":"1999","unstructured":"Konda, V.R., Borkar, V.: Actor-critic type learning algorithms for Markov decision processes. SIAM Journal on Control and Optimization\u00a038(1), 94\u2013123 (1999)","journal-title":"SIAM Journal on Control and Optimization"},{"issue":"4","key":"7_CR75","doi-asserted-by":"crossref","first-page":"1143","DOI":"10.1137\/S0363012901385691","volume":"42","author":"V.R. Konda","year":"2003","unstructured":"Konda, V.R., Tsitsiklis, J.N.: Actor-critic algorithms. SIAM Journal on Control and Optimization\u00a042(4), 1143\u20131166 (2003)","journal-title":"SIAM Journal on Control and Optimization"},{"key":"7_CR76","volume-title":"Statistics and Information Theory","author":"S. Kullback","year":"1959","unstructured":"Kullback, S.: Statistics and Information Theory. J. Wiley and Sons, New York (1959)"},{"key":"7_CR77","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1214\/aoms\/1177729694","volume":"22","author":"S. Kullback","year":"1951","unstructured":"Kullback, S., Leibler, R.A.: On information and sufficiency. Annals of Mathematical Statistics\u00a022, 79\u201386 (1951)","journal-title":"Annals of Mathematical Statistics"},{"key":"7_CR78","first-page":"1107","volume":"4","author":"M. Lagoudakis","year":"2003","unstructured":"Lagoudakis, M., Parr, R.: Least-squares policy iteration. The Journal of Machine Learning Research\u00a04, 1107\u20131149 (2003)","journal-title":"The Journal of Machine Learning Research"},{"issue":"1","key":"7_CR79","doi-asserted-by":"crossref","first-page":"46","DOI":"10.1109\/91.273126","volume":"2","author":"C. Lin","year":"1994","unstructured":"Lin, C., Lee, C.: Reinforcement structure\/parameter learning for neural-network-based fuzzy logic control systems. IEEE Transactions on Fuzzy Systems\u00a02(1), 46\u201363 (1994)","journal-title":"IEEE Transactions on Fuzzy Systems"},{"issue":"5","key":"7_CR80","doi-asserted-by":"crossref","first-page":"530","DOI":"10.1109\/72.134290","volume":"2","author":"C.S. Lin","year":"1991","unstructured":"Lin, C.S., Kim, H.: CMAC-based adaptive critic self-learning control. IEEE Transactions on Neural Networks\u00a02(5), 530\u2013533 (1991)","journal-title":"IEEE Transactions on Neural Networks"},{"issue":"3","key":"7_CR81","first-page":"293","volume":"8","author":"L. Lin","year":"1992","unstructured":"Lin, L.: Self-improving reactive agents based on reinforcement learning, planning and teaching. Machine Learning\u00a08(3), 293\u2013321 (1992)","journal-title":"Machine Learning"},{"key":"7_CR82","unstructured":"Lin, L.J.: Reinforcement learning for robots using neural networks. PhD thesis, Carnegie Mellon University, Pittsburgh (1993)"},{"key":"7_CR83","first-page":"310","volume-title":"Proceedings of the 13th International Conference on Machine Learning (ICML 1996)","author":"M.L. Littman","year":"1996","unstructured":"Littman, M.L., Szepesv\u00e1ri, C.: A generalized reinforcement-learning model: Convergence and applications. In: Saitta, L. (ed.) Proceedings of the 13th International Conference on Machine Learning (ICML 1996), pp. 310\u2013318. Morgan Kaufmann, Bari (1996)"},{"key":"7_CR84","first-page":"91","volume-title":"Proceedings of the Third Conference On Artificial General Intelligence (AGI-2010)","author":"H.R. Maei","year":"2010","unstructured":"Maei, H.R., Sutton, R.S.: GQ (\u03bb): A general gradient algorithm for temporal-difference prediction learning with eligibility traces. In: Proceedings of the Third Conference On Artificial General Intelligence (AGI-2010), pp. 91\u201396. Atlantis Press, Lugano (2010)"},{"key":"7_CR85","unstructured":"Maei, H.R., Szepesv\u00e1ri, C., Bhatnagar, S., Precup, D., Silver, D., Sutton, R.: Convergent temporal-difference learning with arbitrary smooth function approximation. In: Advances in Neural Information Processing Systems 22 (NIPS-2009) (2009)"},{"key":"7_CR86","volume-title":"Proceedings of the 27th Annual International Conference on Machine Learning (ICML-2010)","author":"H.R. Maei","year":"2010","unstructured":"Maei, H.R., Szepesv\u00e1ri, C., Bhatnagar, S., Sutton, R.S.: Toward off-policy learning control with function approximation. In: Proceedings of the 27th Annual International Conference on Machine Learning (ICML-2010). ACM, New York (2010)"},{"key":"7_CR87","unstructured":"Maillard, O.A., Munos, R., Lazaric, A., Ghavamzadeh, M.: Finite sample analysis of Bellman residual minimization. In: Asian Conference on Machine Learning, ACML-2010 (2010)"},{"key":"7_CR88","volume-title":"Machine learning","author":"T.M. Mitchell","year":"1996","unstructured":"Mitchell, T.M.: Machine learning. McGraw Hill, New York (1996)"},{"key":"7_CR89","first-page":"11","volume":"22","author":"D.E. Moriarty","year":"1996","unstructured":"Moriarty, D.E., Miikkulainen, R.: Efficient reinforcement learning through symbiotic evolution. Machine Learning\u00a022, 11\u201332 (1996)","journal-title":"Machine Learning"},{"key":"7_CR90","doi-asserted-by":"crossref","first-page":"241","DOI":"10.1613\/jair.613","volume":"11","author":"D.E. Moriarty","year":"1999","unstructured":"Moriarty, D.E., Schultz, A.C., Grefenstette, J.J.: Evolutionary algorithms for reinforcement learning. Journal of Artificial Intelligence Research\u00a011, 241\u2013276 (1999)","journal-title":"Journal of Artificial Intelligence Research"},{"issue":"2","key":"7_CR91","doi-asserted-by":"crossref","first-page":"140","DOI":"10.1109\/TSMCC.2002.801727","volume":"32","author":"J.J. Murray","year":"2002","unstructured":"Murray, J.J., Cox, C.J., Lendaris, G.G., Saeks, R.: Adaptive dynamic programming. IEEE Transactions on Systems, Man, and Cybernetics, Part C: Applications and Reviews\u00a032(2), 140\u2013153 (2002)","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part C: Applications and Reviews"},{"key":"7_CR92","doi-asserted-by":"crossref","first-page":"323","DOI":"10.1109\/TSMC.1974.5408453","volume":"4","author":"K.S. Narendra","year":"1974","unstructured":"Narendra, K.S., Thathachar, M.A.L.: Learning automata - a survey. IEEE Transactions on Systems, Man, and Cybernetics\u00a04, 323\u2013334 (1974)","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics"},{"key":"7_CR93","volume-title":"Learning automata: an introduction","author":"K.S. Narendra","year":"1989","unstructured":"Narendra, K.S., Thathachar, M.A.L.: Learning automata: an introduction. Prentice-Hall, Inc., Upper Saddle River (1989)"},{"issue":"1-2","key":"7_CR94","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1023\/A:1022192903948","volume":"13","author":"A. Nedi\u0107","year":"2003","unstructured":"Nedi\u0107, A., Bertsekas, D.P.: Least squares policy evaluation algorithms with linear function approximation. Discrete Event Dynamic Systems\u00a013(1-2), 79\u2013110 (2003)","journal-title":"Discrete Event Dynamic Systems"},{"issue":"1","key":"7_CR95","first-page":"175","volume":"20","author":"J. Neyman","year":"1928","unstructured":"Neyman, J., Pearson, E.S.: On the use and interpretation of certain test criteria for purposes of statistical inference part i. Biometrika\u00a020(1), 175\u2013240 (1928)","journal-title":"Biometrika"},{"key":"7_CR96","unstructured":"Ng, A.Y., Parr, R., Koller, D.: Policy search via density estimation. In: Solla, S.A., Leen, T.K., M\u00fcller, K.R. (eds.) Advances in Neural Information Processing Systems, vol.\u00a013, pp. 1022\u20131028. The MIT Press (1999)"},{"key":"7_CR97","unstructured":"Nguyen-Tuong, D., Peters, J.: Model learning for robot control: a survey. Cognitive Processing, 1\u201322 (2011)"},{"issue":"2","key":"7_CR98","doi-asserted-by":"crossref","first-page":"161","DOI":"10.1023\/A:1017928328829","volume":"49","author":"D. Ormoneit","year":"2002","unstructured":"Ormoneit, D., Sen, \u015a.: Kernel-based reinforcement learning. Machine Learning\u00a049(2), 161\u2013178 (2002)","journal-title":"Machine Learning"},{"key":"7_CR99","doi-asserted-by":"crossref","unstructured":"Pazis, J., Lagoudakis, M.G.: Binary action search for learning continuous-action control policies. In: Proceedings of the 26th Annual International Conference on Machine Learning, pp. 793\u2013800. ACM (2009)","DOI":"10.1145\/1553374.1553476"},{"key":"7_CR100","unstructured":"Peng, J.: Efficient dynamic programming-based learning for control. PhD thesis, Northeastern University (1993)"},{"issue":"7-9","key":"7_CR101","doi-asserted-by":"crossref","first-page":"1180","DOI":"10.1016\/j.neucom.2007.11.026","volume":"71","author":"J. Peters","year":"2008","unstructured":"Peters, J., Schaal, S.: Natural actor-critic. Neurocomputing\u00a071(7-9), 1180\u20131190 (2008a)","journal-title":"Neurocomputing"},{"issue":"4","key":"7_CR102","doi-asserted-by":"crossref","first-page":"682","DOI":"10.1016\/j.neunet.2008.02.003","volume":"21","author":"J. Peters","year":"2008","unstructured":"Peters, J., Schaal, S.: Reinforcement learning of motor skills with policy gradients. Neural Networks\u00a021(4), 682\u2013697 (2008b)","journal-title":"Neural Networks"},{"key":"7_CR103","unstructured":"Peters, J., Vijayakumar, S., Schaal, S.: Reinforcement learning for humanoid robotics. In: IEEE-RAS International Conference on Humanoid Robots (Humanoids 2003). IEEE Press (2003)"},{"key":"7_CR104","doi-asserted-by":"crossref","unstructured":"Poupart, P., Vlassis, N., Hoey, J., Regan, K.: An analytic solution to discrete Bayesian reinforcement learning. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 697\u2013704. ACM (2006)","DOI":"10.1145\/1143844.1143932"},{"issue":"3","key":"7_CR105","doi-asserted-by":"crossref","first-page":"555","DOI":"10.1007\/s101070100290","volume":"92","author":"M. Powell","year":"2002","unstructured":"Powell, M.: UOBYQA: unconstrained optimization by quadratic approximation. Mathematical Programming\u00a092(3), 555\u2013582 (2002)","journal-title":"Mathematical Programming"},{"key":"7_CR106","doi-asserted-by":"crossref","unstructured":"Powell, M.: The NEWUOA software for unconstrained optimization without derivatives. In: Large-Scale Nonlinear Optimization, pp. 255\u2013297 (2006)","DOI":"10.1007\/0-387-30065-1_16"},{"key":"7_CR107","doi-asserted-by":"crossref","unstructured":"Powell, W.B.: Approximate Dynamic Programming: Solving the Curses of Dimensionality. Wiley-Blackwell (2007)","DOI":"10.1002\/9780470182963"},{"key":"7_CR108","first-page":"417","volume-title":"Machine Learning: Proceedings of the Eighteenth International Conference (ICML 2001)","author":"D. Precup","year":"2001","unstructured":"Precup, D., Sutton, R.S.: Off-policy temporal-difference learning with function approximation. In: Machine Learning: Proceedings of the Eighteenth International Conference (ICML 2001), pp. 417\u2013424. Morgan Kaufmann, Williams College (2001)"},{"key":"7_CR109","first-page":"766","volume-title":"Proceedings of the Seventeenth International Conference on Machine Learning (ICML 2000)","author":"D. Precup","year":"2000","unstructured":"Precup, D., Sutton, R.S., Singh, S.P.: Eligibility traces for off-policy policy evaluation. In: Proceedings of the Seventeenth International Conference on Machine Learning (ICML 2000), pp. 766\u2013773. Morgan Kaufmann, Stanford University, Stanford, CA (2000)"},{"issue":"5","key":"7_CR110","doi-asserted-by":"crossref","first-page":"997","DOI":"10.1109\/72.623201","volume":"8","author":"D.V. Prokhorov","year":"2002","unstructured":"Prokhorov, D.V., Wunsch, D.C.: Adaptive critic designs. IEEE Transactions on Neural Networks\u00a08(5), 997\u20131007 (2002)","journal-title":"IEEE Transactions on Neural Networks"},{"key":"7_CR111","doi-asserted-by":"crossref","DOI":"10.1002\/9780470316887","volume-title":"Markov Decision Processes: Discrete Stochastic Dynamic Programming","author":"M.L. Puterman","year":"1994","unstructured":"Puterman, M.L.: Markov Decision Processes: Discrete Stochastic Dynamic Programming. John Wiley & Sons, Inc., New York (1994)"},{"issue":"11","key":"7_CR112","doi-asserted-by":"crossref","first-page":"1127","DOI":"10.1287\/mnsc.24.11.1127","volume":"24","author":"M.L. Puterman","year":"1978","unstructured":"Puterman, M.L., Shin, M.C.: Modified policy iteration algorithms for discounted Markov decision problems. Management Science\u00a024(11), 1127\u20131137 (1978)","journal-title":"Management Science"},{"key":"7_CR113","unstructured":"Rao, C.R., Poti, S.J.: On locally most powerful tests when alternatives are one sided. Sankhy\u0101: The Indian Journal of Statistics, 439\u2013439 (1946)"},{"key":"7_CR114","unstructured":"Rechenberg, I.: Evolutionsstrategie - Optimierung technischer Systeme nach Prinzipien der biologischen Evolution. Fromman-Holzboog (1971)"},{"key":"7_CR115","series-title":"Lecture Notes in Artificial Intelligence","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1007\/11564096_32","volume-title":"Machine Learning: ECML 2005","author":"M. Riedmiller","year":"2005","unstructured":"Riedmiller, M.: Neural Fitted Q Iteration - First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds.) ECML 2005. LNCS (LNAI), vol.\u00a03720, pp. 317\u2013328. Springer, Heidelberg (2005)"},{"key":"7_CR116","unstructured":"Ripley, B.D.: Pattern recognition and neural networks. Cambridge University Press (2008)"},{"issue":"2","key":"7_CR117","doi-asserted-by":"crossref","first-page":"127","DOI":"10.1023\/A:1010091220143","volume":"1","author":"R. Rubinstein","year":"1999","unstructured":"Rubinstein, R.: The cross-entropy method for combinatorial and continuous optimization. Methodology and Computing in Applied Probability\u00a01(2), 127\u2013190 (1999)","journal-title":"Methodology and Computing in Applied Probability"},{"key":"7_CR118","unstructured":"Rubinstein, R., Kroese, D.: The cross-entropy method: a unified approach to combinatorial optimization, Monte-Carlo simulation, and machine learning. Springer-Verlag New York Inc. (2004)"},{"issue":"1","key":"7_CR119","first-page":"14","volume":"1","author":"T. R\u00fcckstie\u00df","year":"2010","unstructured":"R\u00fcckstie\u00df, T., Sehnke, F., Schaul, T., Wierstra, D., Sun, Y., Schmidhuber, J.: Exploring parameter space in reinforcement learning. Paladyn\u00a01(1), 14\u201324 (2010)","journal-title":"Paladyn"},{"key":"7_CR120","doi-asserted-by":"crossref","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J.: Learning internal representations by error propagation. In: Parallel Distributed Processing, vol.\u00a01, pp. 318\u2013362. MIT Press (1986)","DOI":"10.21236\/ADA164453"},{"key":"7_CR121","unstructured":"Rummery, G.A., Niranjan, M.: On-line Q-learning using connectionist sytems. Tech. Rep. CUED\/F-INFENG-TR 166, Cambridge University, UK (1994)"},{"issue":"2","key":"7_CR122","doi-asserted-by":"crossref","first-page":"163","DOI":"10.1177\/105971239700600201","volume":"6","author":"J.C. Santamaria","year":"1997","unstructured":"Santamaria, J.C., Sutton, R.S., Ram, A.: Experiments with reinforcement learning in problems with continuous state and action spaces. Adaptive Behavior\u00a06(2), 163\u2013217 (1997)","journal-title":"Adaptive Behavior"},{"key":"7_CR123","unstructured":"Scherrer, B.: Should one compute the temporal difference fix point or minimize the Bellman residual? The unified oblique projection view. In: F\u00fcrnkranz, J., Joachims, T. (eds.) Proceedings of the 27th International Conference on Machine Learning (ICML 2010), pp. 959\u2013966. Omnipress (2010)"},{"key":"7_CR124","volume-title":"Numerische Optimierung von Computer-Modellen, Interdisciplinary Systems Research","author":"H.P. Schwefel","year":"1977","unstructured":"Schwefel, H.P.: Numerische Optimierung von Computer-Modellen. Interdisciplinary Systems Research, vol.\u00a026. Birkh\u00e4user, Basel (1977)"},{"issue":"4","key":"7_CR125","doi-asserted-by":"crossref","first-page":"551","DOI":"10.1016\/j.neunet.2009.12.004","volume":"23","author":"F. Sehnke","year":"2010","unstructured":"Sehnke, F., Osendorfer, C., R\u00fcckstie\u00df, T., Graves, A., Peters, J., Schmidhuber, J.: Parameter-exploring policy gradients. Neural Networks\u00a023(4), 551\u2013559 (2010)","journal-title":"Neural Networks"},{"key":"7_CR126","first-page":"123","volume":"22","author":"S.P. Singh","year":"1996","unstructured":"Singh, S.P., Sutton, R.S.: Reinforcement learning with replacing eligibility traces. Machine Learning\u00a022, 123\u2013158 (1996)","journal-title":"Machine Learning"},{"issue":"1","key":"7_CR127","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1613\/jair.1659","volume":"24","author":"M. Spaan","year":"2005","unstructured":"Spaan, M., Vlassis, N.: Perseus: Randomized point-based value iteration for POMDPs. Journal of Artificial Intelligence Research\u00a024(1), 195\u2013220 (2005)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"7_CR128","first-page":"569","volume-title":"Proceedings of the Genetic and Evolutionary Computation Conference (GECCO-2002)","author":"K.O. Stanley","year":"2002","unstructured":"Stanley, K.O., Miikkulainen, R.: Efficient reinforcement learning through evolving neural network topologies. In: Proceedings of the Genetic and Evolutionary Computation Conference (GECCO-2002), pp. 569\u2013577. Morgan Kaufmann, San Francisco (2002)"},{"key":"7_CR129","doi-asserted-by":"crossref","unstructured":"Strehl, A.L., Li, L., Wiewiora, E., Langford, J., Littman, M.L.: PAC model-free reinforcement learning. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 881\u2013888. ACM (2006)","DOI":"10.1145\/1143844.1143955"},{"key":"7_CR130","unstructured":"Strens, M.: A Bayesian framework for reinforcement learning. In: Proceedings of the Seventeenth International Conference on Machine Learning, p. 950. Morgan Kaufmann Publishers Inc. (2000)"},{"key":"7_CR131","doi-asserted-by":"crossref","unstructured":"Sun, Y., Wierstra, D., Schaul, T., Schmidhuber, J.: Efficient natural evolution strategies. In: Proceedings of the 11th Annual conference on Genetic and Evolutionary Computation (GECCO-2009), pp. 539\u2013546. ACM (2009)","DOI":"10.1145\/1569901.1569976"},{"key":"7_CR132","unstructured":"Sutton, R.S.: Temporal credit assignment in reinforcement learning. PhD thesis, University of Massachusetts, Dept. of Comp. and Inf. Sci. (1984)"},{"key":"7_CR133","first-page":"9","volume":"3","author":"R.S. Sutton","year":"1988","unstructured":"Sutton, R.S.: Learning to predict by the methods of temporal differences. Machine Learning\u00a03, 9\u201344 (1988)","journal-title":"Machine Learning"},{"key":"7_CR134","first-page":"1038","volume-title":"Advances in Neural Information Processing Systems","author":"R.S. Sutton","year":"1996","unstructured":"Sutton, R.S.: Generalization in reinforcement learning: Successful examples using sparse coarse coding. In: Touretzky, D.S., Mozer, M.C., Hasselmo, M.E. (eds.) Advances in Neural Information Processing Systems, vol.\u00a08, pp. 1038\u20131045. MIT Press, Cambridge (1996)"},{"key":"7_CR135","volume-title":"Reinforcement Learning: An Introduction","author":"R.S. Sutton","year":"1998","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: An Introduction. The MIT press, Cambridge (1998)"},{"key":"7_CR136","unstructured":"Sutton, R.S., McAllester, D., Singh, S., Mansour, Y.: Policy gradient methods for reinforcement learning with function approximation. In: Advances in Neural Information Processing Systems 13 (NIPS-2000), vol.\u00a012, pp. 1057\u20131063 (2000)"},{"key":"7_CR137","unstructured":"Sutton, R.S., Szepesv\u00e1ri, C., Maei, H.R.: A convergent O(n) algorithm for off-policy temporal-difference learning with linear function approximation. In: Advances in Neural Information Processing Systems 21 (NIPS-2008), vol.\u00a021, pp. 1609\u20131616 (2008)"},{"key":"7_CR138","doi-asserted-by":"crossref","unstructured":"Sutton, R.S., Maei, H.R., Precup, D., Bhatnagar, S., Silver, D., Szepesv\u00e1ri, C., Wiewiora, E.: Fast gradient-descent methods for temporal-difference learning with linear function approximation. In: Proceedings of the 26th Annual International Conference on Machine Learning (ICML 2009), pp. 993\u20131000. ACM (2009)","DOI":"10.1145\/1553374.1553501"},{"issue":"1","key":"7_CR139","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/978-3-031-01551-9","volume":"4","author":"C. Szepesv\u00e1ri","year":"2010","unstructured":"Szepesv\u00e1ri, C.: Algorithms for reinforcement learning. Synthesis Lectures on Artificial Intelligence and Machine Learning\u00a04(1), 1\u2013103 (2010)","journal-title":"Synthesis Lectures on Artificial Intelligence and Machine Learning"},{"key":"7_CR140","doi-asserted-by":"crossref","unstructured":"Szepesv\u00e1ri, C., Smart, W.D.: Interpolation-based Q-learning. In: Proceedings of the Twenty-First International Conference on Machine Learning (ICML 2004), p. 100. ACM (2004)","DOI":"10.1145\/1015330.1015445"},{"issue":"12","key":"7_CR141","doi-asserted-by":"crossref","first-page":"2936","DOI":"10.1162\/neco.2006.18.12.2936","volume":"18","author":"I. Szita","year":"2006","unstructured":"Szita, I., L\u00f6rincz, A.: Learning tetris using the noisy cross-entropy method. Neural Computation\u00a018(12), 2936\u20132941 (2006)","journal-title":"Neural Computation"},{"key":"7_CR142","doi-asserted-by":"crossref","unstructured":"Taylor, M.E., Whiteson, S., Stone, P.: Comparing evolutionary and temporal difference methods in a reinforcement learning domain. In: Proceedings of the 8th Annual Conference on Genetic and Evolutionary Computation, p. 1328. ACM (2006)","DOI":"10.1145\/1143997.1144202"},{"key":"7_CR143","first-page":"259","volume-title":"Advances in Neural Information Processing Systems","author":"G. Tesauro","year":"1992","unstructured":"Tesauro, G.: Practical issues in temporal difference learning. In: Lippman, D.S., Moody, J.E., Touretzky, D.S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a04, pp. 259\u2013266. Morgan Kaufmann, San Mateo (1992)"},{"issue":"2","key":"7_CR144","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1162\/neco.1994.6.2.215","volume":"6","author":"G. Tesauro","year":"1994","unstructured":"Tesauro, G.: TD-Gammon, a self-teaching backgammon program, achieves master-level play. Neural Computation\u00a06(2), 215\u2013219 (1994)","journal-title":"Neural Computation"},{"key":"7_CR145","doi-asserted-by":"crossref","first-page":"58","DOI":"10.1145\/203330.203343","volume":"38","author":"G.J. Tesauro","year":"1995","unstructured":"Tesauro, G.J.: Temporal difference learning and TD-Gammon. Communications of the ACM\u00a038, 58\u201368 (1995)","journal-title":"Communications of the ACM"},{"key":"7_CR146","volume-title":"Proceedings of the 1993 Connectionist Models Summer School","author":"S. Thrun","year":"1993","unstructured":"Thrun, S., Schwartz, A.: Issues in using function approximation for reinforcement learning. In: Mozer, M., Smolensky, P., Touretzky, D., Elman, J., Weigend, A. (eds.) Proceedings of the 1993 Connectionist Models Summer School. Lawrence Erlbaum, Hillsdale (1993)"},{"issue":"3\/4","key":"7_CR147","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1016\/S0921-8890(97)00042-0","volume":"22","author":"C.F. Touzet","year":"1997","unstructured":"Touzet, C.F.: Neural reinforcement learning for behaviour synthesis. Robotics and Autonomous Systems\u00a022(3\/4), 251\u2013281 (1997)","journal-title":"Robotics and Autonomous Systems"},{"key":"7_CR148","unstructured":"Tsitsiklis, J.N., Van Roy, B.: An analysis of temporal-difference learning with function approximation. Tech. Rep. LIDS-P-2322, MIT Laboratory for Information and Decision Systems, Cambridge, MA (1996)"},{"issue":"5","key":"7_CR149","doi-asserted-by":"crossref","first-page":"674","DOI":"10.1109\/9.580874","volume":"42","author":"J.N. Tsitsiklis","year":"1997","unstructured":"Tsitsiklis, J.N., Van Roy, B.: An analysis of temporal-difference learning with function approximation. IEEE Transactions on Automatic Control\u00a042(5), 674\u2013690 (1997)","journal-title":"IEEE Transactions on Automatic Control"},{"key":"7_CR150","unstructured":"van Hasselt, H.P.: Double Q-Learning. In: Advances in Neural Information Processing Systems, vol.\u00a023. The MIT Press (2010)"},{"key":"7_CR151","unstructured":"van Hasselt, H.P.: Insights in reinforcement learning. PhD thesis, Utrecht University (2011)"},{"key":"7_CR152","doi-asserted-by":"crossref","unstructured":"van Hasselt, H.P., Wiering, M.A.: Reinforcement learning in continuous action spaces. In: Proceedings of the IEEE International Symposium on Adaptive Dynamic Programming and Reinforcement Learning (ADPRL-2007), pp. 272\u2013279 (2007)","DOI":"10.1109\/ADPRL.2007.368199"},{"key":"7_CR153","doi-asserted-by":"crossref","unstructured":"van Hasselt, H.P., Wiering, M.A.: Using continuous action spaces to solve discrete problems. In: Proceedings of the International Joint Conference on Neural Networks (IJCNN 2009), pp. 1149\u20131156 (2009)","DOI":"10.1109\/IJCNN.2009.5178745"},{"key":"7_CR154","doi-asserted-by":"crossref","unstructured":"van Seijen, H., van Hasselt, H.P., Whiteson, S., Wiering, M.A.: A theoretical and empirical analysis of Expected Sarsa. In: Proceedings of the IEEE International Symposium on Adaptive Dynamic Programming and Reinforcement Learning, pp. 177\u2013184 (2009)","DOI":"10.1109\/ADPRL.2009.4927542"},{"key":"7_CR155","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4757-2440-0","volume-title":"The nature of statistical learning theory","author":"V.N. Vapnik","year":"1995","unstructured":"Vapnik, V.N.: The nature of statistical learning theory. Springer, Heidelberg (1995)"},{"issue":"2","key":"7_CR156","doi-asserted-by":"crossref","first-page":"477","DOI":"10.1016\/j.automatica.2008.08.017","volume":"45","author":"D. Vrabie","year":"2009","unstructured":"Vrabie, D., Pastravanu, O., Abu-Khalaf, M., Lewis, F.: Adaptive optimal control for continuous-time linear systems based on policy iteration. Automatica\u00a045(2), 477\u2013484 (2009)","journal-title":"Automatica"},{"issue":"2","key":"7_CR157","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/MCI.2009.932261","volume":"4","author":"F.Y. Wang","year":"2009","unstructured":"Wang, F.Y., Zhang, H., Liu, D.: Adaptive dynamic programming: An introduction. IEEE Computational Intelligence Magazine\u00a04(2), 39\u201347 (2009)","journal-title":"IEEE Computational Intelligence Magazine"},{"key":"7_CR158","unstructured":"Watkins, C.J.C.H.: Learning from delayed rewards. PhD thesis, King\u2019s College, Cambridge, England (1989)"},{"key":"7_CR159","first-page":"279","volume":"8","author":"C.J.C.H. Watkins","year":"1992","unstructured":"Watkins, C.J.C.H., Dayan, P.: Q-learning. Machine Learning\u00a08, 279\u2013292 (1992)","journal-title":"Machine Learning"},{"key":"7_CR160","unstructured":"Werbos, P.J.: Beyond regression: New tools for prediction and analysis in the behavioral sciences. PhD thesis, Harvard University (1974)"},{"key":"7_CR161","unstructured":"Werbos, P.J.: Advanced forecasting methods for global crisis warning and models of intelligence. In: General Systems, vol. XXII, pp. 25\u201338 (1977)"},{"key":"7_CR162","doi-asserted-by":"crossref","unstructured":"Werbos, P.J.: Backpropagation and neurocontrol: A review and prospectus. In: IEEE\/INNS International Joint Conference on Neural Networks, Washington, D.C, vol.\u00a01, pp. 209\u2013216 (1989a)","DOI":"10.1109\/IJCNN.1989.118583"},{"key":"7_CR163","unstructured":"Werbos, P.J.: Neural networks for control and system identification. In: Proceedings of IEEE\/CDC, Tampa, Florida (1989b)"},{"key":"7_CR164","doi-asserted-by":"crossref","first-page":"179","DOI":"10.1016\/0893-6080(90)90088-3","volume":"2","author":"P.J. Werbos","year":"1990","unstructured":"Werbos, P.J.: Consistency of HDP applied to a simple reinforcement learning problem. Neural Networks\u00a02, 179\u2013189 (1990)","journal-title":"Neural Networks"},{"issue":"10","key":"7_CR165","doi-asserted-by":"crossref","first-page":"1550","DOI":"10.1109\/5.58337","volume":"78","author":"P.J. Werbos","year":"2002","unstructured":"Werbos, P.J.: Backpropagation through time: What it does and how to do it. Proceedings of the IEEE\u00a078(10), 1550\u20131560 (2002)","journal-title":"Proceedings of the IEEE"},{"key":"7_CR166","first-page":"877","volume":"7","author":"S. Whiteson","year":"2006","unstructured":"Whiteson, S., Stone, P.: Evolutionary function approximation for reinforcement learning. Journal of Machine Learning Research\u00a07, 877\u2013917 (2006)","journal-title":"Journal of Machine Learning Research"},{"issue":"2","key":"7_CR167","doi-asserted-by":"crossref","first-page":"259","DOI":"10.1023\/A:1022674030396","volume":"13","author":"D. Whitley","year":"1993","unstructured":"Whitley, D., Dominic, S., Das, R., Anderson, C.W.: Genetic reinforcement learning for neurocontrol problems. Machine Learning\u00a013(2), 259\u2013284 (1993)","journal-title":"Machine Learning"},{"key":"7_CR168","first-page":"667","volume-title":"International Joint Conference on Neural Networks","author":"A.P. Wieland","year":"1991","unstructured":"Wieland, A.P.: Evolving neural network controllers for unstable systems. In: International Joint Conference on Neural Networks, vol.\u00a02, pp. 667\u2013673. IEEE, New York (1991)"},{"key":"7_CR169","doi-asserted-by":"crossref","unstructured":"Wiering, M.A., van Hasselt, H.P.: The QV family compared to other reinforcement learning algorithms. In: Proceedings of the IEEE International Symposium on Adaptive Dynamic Programming and Reinforcement Learning, pp. 101\u2013108 (2009)","DOI":"10.1109\/ADPRL.2009.4927532"},{"key":"7_CR170","doi-asserted-by":"crossref","unstructured":"Wierstra, D., Schaul, T., Peters, J., Schmidhuber, J.: Natural evolution strategies. In: IEEE Congress on Evolutionary Computation (CEC-2008), pp. 3381\u20133387. IEEE (2008)","DOI":"10.1109\/CEC.2008.4631255"},{"key":"7_CR171","first-page":"229","volume":"8","author":"R.J. Williams","year":"1992","unstructured":"Williams, R.J.: Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning\u00a08, 229\u2013256 (1992)","journal-title":"Machine Learning"},{"issue":"2","key":"7_CR172","doi-asserted-by":"crossref","first-page":"270","DOI":"10.1162\/neco.1989.1.2.270","volume":"1","author":"R.J. Williams","year":"1989","unstructured":"Williams, R.J., Zipser, D.: A learning algorithm for continually running fully recurrent neural networks. Neural Computation\u00a01(2), 270\u2013280 (1989)","journal-title":"Neural Computation"},{"issue":"10","key":"7_CR173","doi-asserted-by":"crossref","first-page":"1429","DOI":"10.1016\/S0893-6080(03)00138-2","volume":"16","author":"D.R. Wilson","year":"2003","unstructured":"Wilson, D.R., Martinez, T.R.: The general inefficiency of batch training for gradient descent learning. Neural Networks\u00a016(10), 1429\u20131451 (2003)","journal-title":"Neural Networks"},{"issue":"3","key":"7_CR174","doi-asserted-by":"crossref","first-page":"338","DOI":"10.1016\/S0019-9958(65)90241-X","volume":"8","author":"L. Zadeh","year":"1965","unstructured":"Zadeh, L.: Fuzzy sets. Information and Control\u00a08(3), 338\u2013353 (1965)","journal-title":"Information and Control"},{"issue":"1","key":"7_CR175","doi-asserted-by":"crossref","first-page":"169","DOI":"10.1016\/S0165-0114(02)00236-1","volume":"134","author":"C. Zhou","year":"2003","unstructured":"Zhou, C., Meng, Q.: Dynamic balance of a biped robot using fuzzy reinforcement learning agents. Fuzzy Sets and Systems\u00a0134(1), 169\u2013187 (2003)","journal-title":"Fuzzy Sets and Systems"}],"container-title":["Adaptation, Learning, and Optimization","Reinforcement Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-27645-3_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T13:06:13Z","timestamp":1742648773000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-642-27645-3_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642276446","9783642276453"],"references-count":175,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-27645-3_7","relation":{},"ISSN":["1867-4534","1867-4542"],"issn-type":[{"value":"1867-4534","type":"print"},{"value":"1867-4542","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}