{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T20:42:46Z","timestamp":1759092166412},"reference-count":128,"publisher":"Springer Science and Business Media LLC","issue":"1-2","license":[{"start":{"date-parts":[[2011,6,3]],"date-time":"2011-06-03T00:00:00Z","timestamp":1307059200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2011,7]]},"DOI":"10.1007\/s10994-011-5251-x","type":"journal-article","created":{"date-parts":[[2011,6,3]],"date-time":"2011-06-03T06:48:57Z","timestamp":1307083737000},"page":"205-247","source":"Crossref","is-referenced-by-count":16,"title":["Characterizing reinforcement learning methods through parameterized learning problems"],"prefix":"10.1007","volume":"84","author":[{"given":"Shivaram","family":"Kalyanakrishnan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peter","family":"Stone","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2011,6,3]]},"reference":[{"key":"5251_CR1","volume-title":"Brains, behavior and robotics","author":"J. S. Albus","year":"1981","unstructured":"Albus, J. S. (1981). Brains, behavior and robotics. New York: McGraw-Hill."},{"key":"5251_CR2","doi-asserted-by":"crossref","first-page":"174","DOI":"10.1016\/0022-247X(65)90154-X","volume":"10","author":"K. J. \u00c5str\u00f6m","year":"1965","unstructured":"\u00c5str\u00f6m, K. J. (1965). Optimal control of Markov processes with incomplete state information. Journal of Mathematical Analysis and Applications, 10, 174\u2013205.","journal-title":"Journal of Mathematical Analysis and Applications"},{"key":"5251_CR3","first-page":"968","volume-title":"Advances in neural information processing systems 11 (NIPS 1998)","author":"L. Baird","year":"1999","unstructured":"Baird, L., & Moore, A. (1999). Gradient descent for general reinforcement learning. In M. J. Kearns, S. A. Solla, & D. A. Cohn (Eds.), Advances in neural information processing systems 11 (NIPS 1998) (pp. 968\u2013974). Cambridge: MIT Press."},{"key":"5251_CR4","doi-asserted-by":"crossref","first-page":"430","DOI":"10.1109\/IROS.2003.1250667","volume-title":"Proceedings of the 2003 IEEE\/RSJ international conference on intelligent robots and systems (IROS 2003)","author":"B. Bakker","year":"2003","unstructured":"Bakker, B., Zhumatiy, V., Gruener, G., & Schmidhuber, J. (2003). A robot that reinforcement-learns to identify and memorize important previous observations. In Proceedings of the 2003 IEEE\/RSJ international conference on intelligent robots and systems (IROS 2003) (pp. 430\u2013435). New York: IEEE Press."},{"key":"5251_CR5","doi-asserted-by":"crossref","first-page":"26","DOI":"10.3115\/1073012.1073017","volume-title":"Proceedings of 39th annual meeting of the association for computational linguistics (ACL 2001)","author":"M. Banko","year":"2001","unstructured":"Banko, M., & Brill, E. (2001). Scaling to very very large corpora for natural language disambiguation. In Proceedings of 39th annual meeting of the association for computational linguistics (ACL 2001) (pp. 26\u201333). Association for Computational Linguistics."},{"issue":"1\u20132","key":"5251_CR6","doi-asserted-by":"crossref","first-page":"105","DOI":"10.1023\/A:1007515423169","volume":"36","author":"E. Bauer","year":"1999","unstructured":"Bauer, E., & Kohavi, R. (1999). An empirical comparison of voting classification algorithms: bagging, boosting, and variants. Machine Learning, 36(1\u20132), 105\u2013139.","journal-title":"Machine Learning"},{"key":"5251_CR7","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1016\/S0954-1810(01)00028-0","volume":"15","author":"J. Baxter","year":"2001","unstructured":"Baxter, J., & Bartlett, P. L. (2001). Infinite-horizon policy-gradient estimation. The Journal of Artificial Intelligence Research, 15, 319\u2013350.","journal-title":"The Journal of Artificial Intelligence Research"},{"key":"5251_CR8","first-page":"1","volume-title":"Dynamic programming","author":"R. Bellman","year":"1957","unstructured":"Bellman, R. (1957). Dynamic programming, 1st ed. (p. 1). Princeton: Princeton University Press.","edition":"1"},{"key":"5251_CR9","volume-title":"Neuro-dynamic programming","author":"D. P. Bertsekas","year":"1996","unstructured":"Bertsekas, D. P., & Tsitsiklis, J. N. (1996). Neuro-dynamic programming. Nashua: Athena Scientific."},{"issue":"2\u20134","key":"5251_CR10","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1016\/S0045-7825(99)00386-2","volume":"186","author":"H.-G. Beyer","year":"2000","unstructured":"Beyer, H.-G. (2000). Evolutionary algorithms in noisy environments: theoretical issues and guidelines for practice. Computer Methods in Applied Mechanics and Engineering, 186(2\u20134), 239\u2013267.","journal-title":"Computer Methods in Applied Mechanics and Engineering"},{"key":"5251_CR11","doi-asserted-by":"crossref","first-page":"213","DOI":"10.1162\/153244303765208377","volume":"3","author":"R. I. Brafman","year":"2003","unstructured":"Brafman, R. I., & Tennenholtz, M. (2003). R-MAX\u2014a general polynomial time algorithm for near-optimal reinforcement learning. Journal of Machine Learning Research, 3, 213\u2013231.","journal-title":"Journal of Machine Learning Research"},{"issue":"1","key":"5251_CR12","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1023\/A:1010933404324","volume":"45","author":"L. Breiman","year":"2001","unstructured":"Breiman, L. (2001). Random forests. Machine Learning, 45(1), 5\u201332.","journal-title":"Machine Learning"},{"issue":"1\u20132","key":"5251_CR13","first-page":"63","volume":"20","author":"C. E. Brodley","year":"1995","unstructured":"Brodley, C. E. (1995). Recursive automatic bias selection for classifier construction. Machine Learning, 20(1\u20132), 63\u201394.","journal-title":"Machine Learning"},{"key":"5251_CR14","first-page":"161","volume-title":"Proceedings of the twenty-third international conference on machine learning (ICML 2006)","author":"R. Caruana","year":"2006","unstructured":"Caruana, R., & Niculescu-Mizil, A. (2006). An empirical comparison of supervised learning algorithms. In W. W. Cohen & A. Moore (Eds.), Proceedings of the twenty-third international conference on machine learning (ICML 2006) (pp. 161\u2013168). New York: ACM."},{"key":"5251_CR15","doi-asserted-by":"crossref","first-page":"96","DOI":"10.1145\/1390156.1390169","volume-title":"Proceedings of the twenty-fifth international conference on machine learning (ICML 2008)","author":"R. Caruana","year":"2008","unstructured":"Caruana, R., Karampatziakis, N., & Yessenalina, A. (2008). An empirical evaluation of supervised learning in high dimensions. In W. W. Cohen, A. McCallum, & S. T. Roweis (Eds.), Proceedings of the twenty-fifth international conference on machine learning (ICML 2008) (pp. 96\u2013103). New York: ACM."},{"key":"5251_CR16","first-page":"1023","volume-title":"Proceedings of the twelfth national conference on artificial intelligence (AAAI 1994)","author":"A. R. Cassandra","year":"1994","unstructured":"Cassandra, A. R., Kaelbling, L. P., & Littman, M. L. (1994). Acting optimally in partially observable stochastic domains. In Proceedings of the twelfth national conference on artificial intelligence (AAAI 1994) (pp. 1023\u20131028). Menlo Park: AAAI Press."},{"key":"5251_CR17","first-page":"183","volume-title":"Proceedings of the tenth national conference on artificial intelligence (AAAI 1992)","author":"L. Chrisman","year":"1992","unstructured":"Chrisman, L. (1992). Reinforcement learning with perceptual aliasing: The perceptual distinctions approach. In W. R. Swartout (Ed.), Proceedings of the tenth national conference on artificial intelligence (AAAI 1992) (pp. 183\u2013188). Menlo Park: AAAI Press."},{"key":"5251_CR18","doi-asserted-by":"crossref","unstructured":"Cobb, H. G. (1992). Inductive biases in a reinforcement learner. Technical report AIC-92-013, Navy Center for Applied Research in Artificial Intelligence, Washington DC, USA.","DOI":"10.21236\/ADA294127"},{"key":"5251_CR19","doi-asserted-by":"crossref","first-page":"576","DOI":"10.1007\/3-540-58484-6_300","volume-title":"Proceedings of the third conference on parallel problem solving from nature (PPSN III)","author":"H. G. Cobb","year":"1994","unstructured":"Cobb, H. G., & Bock, P. (1994). Using a genetic algorithm to search for the representational bias of a collective reinforcement learner. In Y. Davidor, H.-P. Schwefel, & R. M\u00e4nner (Eds.), Proceedings of the third conference on parallel problem solving from nature (PPSN III) (pp. 576\u2013587). Berlin: Springer."},{"issue":"4","key":"5251_CR20","first-page":"35","volume":"9","author":"P. R. Cohen","year":"1988","unstructured":"Cohen, P. R., & Howe, A. E. (1988). How evaluation guides AI research: the message still counts more than the medium. The AI Magazine, 9(4), 35\u201343.","journal-title":"The AI Magazine"},{"key":"5251_CR21","first-page":"1017","volume-title":"Advances in neural information processing systems 8 (NIPS 1995)","author":"R. H. Crites","year":"1996","unstructured":"Crites, R. H., & Barto, A. G. (1996). Improving elevator performance using reinforcement learning. In D. S. Touretzky, M. Mozer, & M. E. Hasselmo (Eds.), Advances in neural information processing systems 8 (NIPS 1995) (pp. 1017\u20131023). Cambridge: MIT Press."},{"key":"5251_CR22","first-page":"295","volume":"14","author":"P. Dayan","year":"1994","unstructured":"Dayan, P., & Sejnowski, T. J. (1994). TD(\u03bb) converges with probability 1. Machine Learning, 14, 295\u2013301.","journal-title":"Machine Learning"},{"issue":"1","key":"5251_CR23","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1007\/s10479-005-5724-z","volume":"134","author":"P.-T. Boer de","year":"2005","unstructured":"de Boer, P.-T., Kroese, D. P., Mannor, S., & Rubinstein, R. Y. (2005). A tutorial on the cross-entropy method. Annals of Operation Research, 134(1), 19\u201367.","journal-title":"Annals of Operation Research"},{"key":"5251_CR24","first-page":"257","volume-title":"Proceedings of the twenty-third international conference on machine learning (ICML 2006)","author":"T. Degris","year":"2006","unstructured":"Degris, T., Sigaud, O., & Wuillemin, P.-H. (2006). Learning the structure of factored Markov Decision Processes in reinforcement learning problems. In W. W. Cohen & A. Moore (Eds.), Proceedings of the twenty-third international conference on machine learning (ICML 2006) (pp. 257\u2013264). New York: ACM."},{"key":"5251_CR25","doi-asserted-by":"crossref","first-page":"249","DOI":"10.1145\/1553374.1553406","volume-title":"Proceedings of the twenty-sixth annual international conference on machine learning (ICML 2009)","author":"C. Diuk","year":"2009","unstructured":"Diuk, C., Li, L., & Leffler, B. R. (2009). The Adaptive k-Meteorologists problem and its application to structure learning and feature selection in reinforcement learning. In A. P. Danyluk, L. Bottou, & M. L. Littman (Eds.), Proceedings of the twenty-sixth annual international conference on machine learning (ICML 2009) (pp. 249\u2013256). New York: ACM."},{"key":"5251_CR26","first-page":"311","volume-title":"Proceedings of the twenty-seventh international conference on machine learning","author":"C. Downey","year":"2010","unstructured":"Downey, C., & Scanner, S. (2010). Temporal difference Bayesian model averaging: A Bayesian perspective on adapting lambda. In J. F\u00fcrnkranz, & T. Cohn (Joachims) (Eds.), Proceedings of the twenty-seventh international conference on machine learning (pp. 311\u2013318). Madison: Omnipress."},{"key":"5251_CR27","first-page":"1499","volume-title":"Advances in neural information processing systems 14 (NIPS 2001)","author":"E. Even-Dar","year":"2001","unstructured":"Even-Dar, E., & Mansour, Y. (2001). Convergence of optimistic and incremental Q-Learning. In T. G. Dietterich, S. Becker, & Z. Ghahramani (Eds.), Advances in neural information processing systems 14 (NIPS 2001) (pp. 1499\u20131506). Cambridge: MIT Press."},{"key":"5251_CR28","first-page":"259","volume-title":"Proceedings of the twenty-third AAAI conference on artificial intelligence (AAAI 2008)","author":"H. Finnsson","year":"2008","unstructured":"Finnsson, H., & Bj\u00f6rnsson, Y. (2008). Simulation-based approach to General Game Playing. In D. Fox & C.\u00a0P.\u00a0Gomes (Eds.), Proceedings of the twenty-third AAAI conference on artificial intelligence (AAAI 2008) (pp. 259\u2013264). Menlo Park: AAAI Press."},{"key":"5251_CR29","first-page":"148","volume-title":"Proceedings of the thirteenth international conference on machine learning (ICML 1996)","author":"Y. Freund","year":"1996","unstructured":"Freund, Y., & Schapire, R. E. (1996). Experiments with a new boosting algorithm. In L. Saitta (Ed.), Proceedings of the thirteenth international conference on machine learning (ICML 1996) (pp. 148\u2013156). Los Altos: Kaufmann."},{"key":"5251_CR30","doi-asserted-by":"crossref","first-page":"61","DOI":"10.1007\/978-3-642-02921-9_6","volume-title":"RoboCup 2008: robot soccer world cup XII","author":"T. Gabel","year":"2009","unstructured":"Gabel, T., Riedmiller, M., & Trost, F. (2009). A case study on improving defense behavior in soccer simulation 2D: The NeuroHassle approach. In L. Iocchi, H. Matsubara, A. Weitzenfeld, & Z. Changjiu (Eds.), RoboCup 2008: robot soccer world cup XII (pp. 61\u201372). Berlin: Springer."},{"issue":"1","key":"5251_CR31","doi-asserted-by":"crossref","first-page":"167","DOI":"10.1007\/s10994-006-8365-9","volume":"65","author":"A. P. George","year":"2006","unstructured":"George, A. P., & Powell, W. B. (2006). Adaptive stepsizes for recursive estimation with applications in approximate dynamic programming. Machine Learning, 65(1), 167\u2013198.","journal-title":"Machine Learning"},{"issue":"1\u20132","key":"5251_CR32","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1016\/S0004-3702(00)00081-3","volume":"126","author":"C. P. Gomes","year":"2001","unstructured":"Gomes, C. P., & Selman, B. (2001). Algorithm portfolios. Artificial Intelligence, 126(1\u20132), 43\u201362.","journal-title":"Artificial Intelligence"},{"key":"5251_CR33","first-page":"937","volume":"9","author":"F. Gomez","year":"2008","unstructured":"Gomez, F., Schmidhuber, J., & Miikkulainen, R. (2008). Accelerated neural evolution through cooperatively coevolved synapses. Journal of Machine Learning Research, 9, 937\u2013965.","journal-title":"Journal of Machine Learning Research"},{"key":"5251_CR34","first-page":"1356","volume-title":"Proceedings of the sixteenth international joint conference on artificial intelligence (IJCAI 1999)","author":"F. J. Gomez","year":"1999","unstructured":"Gomez, F. J., & Miikkulainen, R. (1999). Solving non-Markovian control tasks with neuro-evolution. In T. Dean (Ed.), Proceedings of the sixteenth international joint conference on artificial intelligence (IJCAI 1999) (pp. 1356\u20131362). Los Altos: Kaufmann."},{"key":"5251_CR35","doi-asserted-by":"crossref","first-page":"2084","DOI":"10.1007\/3-540-45110-2_105","volume-title":"Proceedings of the genetic and evolutionary computation conference (GECCO 2003)","author":"F. J. Gomez","year":"2003","unstructured":"Gomez, F. J., & Miikkulainen, R. (2003). Active guidance for a finless rocket using neuroevolution. In E. Cant\u00fa-Paz, J. A. Foster, K. Deb, L. Davis, R. Roy, U.-M. O\u2019Reilly, H.-G. Beyer, R. K. Standish, G. Kendall, S. W. Wilson, M. Harman, J. Wegener, D. Dasgupta, M. A. Potter, A. C. Schultz, K. A. Dowsland, N. Jonoska, & J. F. Miller (Eds.), Proceedings of the genetic and evolutionary computation conference (GECCO 2003) (pp. 2084\u20132095). Berlin: Springer."},{"key":"5251_CR36","doi-asserted-by":"crossref","first-page":"360","DOI":"10.1007\/978-3-642-04921-7_37","volume-title":"Proceedings of the ninth international conference on adaptive and natural computing algorithms (ICANNGA 2009)","author":"M. Grze\u015b","year":"2009","unstructured":"Grze\u015b, M., & Kudenko, D. (2009). Improving optimistic exploration in model-free reinforcement learning. In M. Kolehmainen, P. J. Toivanen, & B. Beliczynski (Eds.), Proceedings of the ninth international conference on adaptive and natural computing algorithms (ICANNGA 2009) (pp. 360\u2013369). Berlin: Springer."},{"key":"5251_CR37","first-page":"227","volume-title":"Proceedings of the nineteenth international conference on machine learning (ICML 2002)","author":"C. Guestrin","year":"2002","unstructured":"Guestrin, C., Lagoudakis, M. G., & Parr, R. (2002). Coordinated reinforcement learning. In C. Sammut & A. G. Hoffman (Eds.), Proceedings of the nineteenth international conference on machine learning (ICML 2002) (pp. 227\u2013234). Los Altos: Kaufmann."},{"key":"5251_CR38","first-page":"1671","volume-title":"Proceedings of the twenty-third AAAI conference on artificial intelligence (AAAI 2008)","author":"A. Guez","year":"2008","unstructured":"Guez, A., Vincent, R. D., Avoli, M., & Pineau, J. (2008). Adaptive treatment of epilepsy via batch-mode reinforcement learning. In D. Fox & C. P. Gomes (Eds.), Proceedings of the twenty-third AAAI conference on artificial intelligence (AAAI 2008) (pp. 1671\u20131678). Menlo Park: AAAI Press."},{"key":"5251_CR39","unstructured":"Hansen, N. (2009). The CMA evolution strategy: a tutorial. http:\/\/www.lri.fr\/~hansen\/cmatutorial.pdf ."},{"issue":"1","key":"5251_CR40","doi-asserted-by":"crossref","first-page":"180","DOI":"10.1109\/TEVC.2008.924423","volume":"13","author":"N. Hansen","year":"2009","unstructured":"Hansen, N., Niederberger, A. S., Guzzella, L., & Koumoutsakos, P. (2009). A method for handling uncertainty in evolutionary optimization with an application to feedback control of combustion. IEEE Transactions on Evolutionary Computation, 13(1), 180\u2013197.","journal-title":"IEEE Transactions on Evolutionary Computation"},{"key":"5251_CR41","first-page":"149","volume-title":"Proceedings of the sixteenth European symposium on artificial neural networks (ESANN 2008)","author":"V. Heidrich-Meisner","year":"2008","unstructured":"Heidrich-Meisner, V., & Igel, C. (2008a). Similarities and differences between policy gradient methods and evolution strategies. In M. Verleysen (Ed.), Proceedings of the sixteenth European symposium on artificial neural networks (ESANN 2008) (pp. 149\u2013154). Evere: D-side Publication."},{"key":"5251_CR42","doi-asserted-by":"crossref","first-page":"136","DOI":"10.1007\/978-3-540-89722-4_11","volume-title":"Recent advances in reinforcement learning: eighth European workshop (EWRL 2008)","author":"V. Heidrich-Meisner","year":"2008","unstructured":"Heidrich-Meisner, V., & Igel, C. (2008b). Variable metric reinforcement learning methods applied to the noisy mountain car problem. In S. Girgin, M. Loth, R. Munos, P. Preux, & D. Ryabko (Eds.), Recent advances in reinforcement learning: eighth European workshop (EWRL 2008) (pp. 136\u2013150). Berlin: Springer."},{"key":"5251_CR43","first-page":"401","volume-title":"Proceedings of the twenty-sixth international conference on machine learning (ICML 2009)","author":"V. Heidrich-Meisner","year":"2009","unstructured":"Heidrich-Meisner, V., & Igel, C. (2009). Hoeffding and Bernstein races for selecting policies in evolutionary direct policy search. In A. P. Danyluk, L. Bottou, & M. L. Littman (Eds.), Proceedings of the twenty-sixth international conference on machine learning (ICML 2009) (pp. 401\u2013408). New York: ACM."},{"key":"5251_CR44","first-page":"705","volume-title":"Advances in neural information processing systems 20 (NIPS 2007)","author":"M. Hutter","year":"2008","unstructured":"Hutter, M., & Legg, S. (2008). Temporal difference updating without a learning rate. In J. C. Platt, D. Koller, Y. Singer, & S. T. Roweis (Eds.), Advances in neural information processing systems 20 (NIPS 2007) (pp. 705\u2013712). Cambridge: MIT Press."},{"key":"5251_CR45","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/ISCA.2008.21","volume-title":"Proceedings of the thirty-fifth international symposium on computer architecture (ISCA 2008)","author":"E. \u0130pek","year":"2008","unstructured":"\u0130pek, E., Mutlu, O., Mart\u00ednez, J., & Caruana, R. (2008). Self-optimizing memory controllers: a reinforcement learning approach. In Proceedings of the thirty-fifth international symposium on computer architecture (ISCA 2008) (pp. 39\u201350). New York: IEEE Press."},{"key":"5251_CR46","first-page":"585","volume-title":"Proceedings of the eighth international conference on autonomous agents and multiagent systems (AAMAS 2009)","author":"M. R. James","year":"2009","unstructured":"James, M. R., & Singh, S. (2009). SarsaLandmark: an algorithm for learning in POMDPs with landmarks. In C. Sierra, C. Castelfranchi, K. S. Decker, & J. S. Sichman (Eds.), Proceedings of the eighth international conference on autonomous agents and multiagent systems (AAMAS 2009) (pp. 585\u2013591). IFAAMAS."},{"key":"5251_CR47","first-page":"1531","volume-title":"Advances in neural information processing systems 14 (NIPS 2001)","author":"S. Kakade","year":"2001","unstructured":"Kakade, S. (2001). A natural policy gradient. In T. G. Dietterich, S. Becker, & Z. Ghahramani (Eds.), Advances in neural information processing systems 14 (NIPS 2001) (pp. 1531\u20131538). Cambridge: MIT Press."},{"key":"5251_CR48","first-page":"650","volume-title":"Proceedings of the sixth international joint conference on autonomous agents and multiagent systems (AAMAS 2007)","author":"S. Kalyanakrishnan","year":"2007","unstructured":"Kalyanakrishnan, S., & Stone, P. (2007). Batch reinforcement learning in a complex domain. In E. H. Durfee, M. Yokoo, M. N. Huhns, & O. Shehory (Eds.), Proceedings of the sixth international joint conference on autonomous agents and multiagent systems (AAMAS 2007) (pp. 650\u2013657). IFAAMAS."},{"issue":"2\u20133","key":"5251_CR49","doi-asserted-by":"crossref","first-page":"209","DOI":"10.1023\/A:1017984413808","volume":"49","author":"M. Kearns","year":"2002","unstructured":"Kearns, M., & Singh, S. (2002). Near-optimal reinforcement learning in polynomial time. Machine Learning, 49(2\u20133), 209\u2013232.","journal-title":"Machine Learning"},{"key":"5251_CR50","first-page":"611","volume-title":"Proceedings of the nineteenth national conference on artificial intelligence (AAAI 2004)","author":"N. Kohl","year":"2004","unstructured":"Kohl, N., & Stone, P. (2004). Machine learning for fast quadrupedal locomotion. In D. L. McGuinness & G. Ferguson (Eds.), Proceedings of the nineteenth national conference on artificial intelligence (AAAI 2004) (pp. 611\u2013616). Menlo Park: AAAI Press."},{"key":"5251_CR51","doi-asserted-by":"crossref","first-page":"521","DOI":"10.1145\/1553374.1553442","volume-title":"Proceedings of the twenty-sixth annual international conference on machine learning (ICML 2009)","author":"J. Z. Kolter","year":"2009","unstructured":"Kolter, J. Z., & Ng, A. Y. (2009). Regularization and feature selection in least-squares temporal difference learning. In A. P. Danyluk, L. Bottou, & M. L. Littman (Eds.), Proceedings of the twenty-sixth annual international conference on machine learning (ICML 2009) (pp. 521\u2013528). New York: ACM."},{"issue":"4","key":"5251_CR52","doi-asserted-by":"crossref","first-page":"1143","DOI":"10.1137\/S0363012901385691","volume":"42","author":"V. R. Konda","year":"2003","unstructured":"Konda, V. R., & Tsitsiklis, J. N. (2003). On actor-critic algorithms. SIAM Journal on Control and Optimization, 42(4), 1143\u20131166.","journal-title":"SIAM Journal on Control and Optimization"},{"key":"5251_CR53","first-page":"3158","volume-title":"Proceedings of the 2004 IEEE\/RSJ international conference on intelligent robots and systems (ICRA 2004)","author":"C. Kwok","year":"2004","unstructured":"Kwok, C., & Fox, D. (2004). Reinforcement learning for sensing strategies. In Proceedings of the 2004 IEEE\/RSJ international conference on intelligent robots and systems (ICRA 2004) (pp. 3158\u20133163). New York: IEEE Press."},{"key":"5251_CR54","doi-asserted-by":"crossref","first-page":"1107","DOI":"10.1162\/jmlr.2003.4.6.1107","volume":"4","author":"M. G. Lagoudakis","year":"2003","unstructured":"Lagoudakis, M. G., & Parr, R. (2003). Least-squares policy iteration. Journal of Machine Learning Research, 4, 1107\u20131149.","journal-title":"Journal of Machine Learning Research"},{"issue":"1","key":"5251_CR55","first-page":"5","volume":"3","author":"P. Langley","year":"1988","unstructured":"Langley, P. (1988). Machine learning as an experimental science. Machine Learning, 3(1), 5\u20138.","journal-title":"Machine Learning"},{"key":"5251_CR56","unstructured":"Langley, P., & Pendrith, M. (1998). Symposium on applications of reinforcement learning: final report for NSF Grant IIS-9810208. Technical report. Institute for the Study of Learning and Expertise."},{"key":"5251_CR57","first-page":"3003","volume-title":"Proceedings of the 2006 IEEE international conference on robotics and automation (ICRA 2006)","author":"H. Lee","year":"2006","unstructured":"Lee, H., Shen, Y., Yu, C.-H., Singh, G., & Ng, A. Y. (2006). Quadruped robot obstacle negotiation via reinforcement learning. In Proceedings of the 2006 IEEE international conference on robotics and automation (ICRA 2006) (pp. 3003\u20133010). New York: IEEE Press."},{"key":"5251_CR58","doi-asserted-by":"crossref","first-page":"899","DOI":"10.1007\/978-3-540-45193-8_75","volume-title":"Proceedings of the ninth international conference on principles and practice of constraint programming (CP 2003)","author":"K. Leyton-Brown","year":"2003","unstructured":"Leyton-Brown, K., Nudelman, E., Andrew, G., McFadden, J., & Shoham, Y. (2003). Boosting as a metaphor for algorithm design. In F. Rossi (Ed.), Proceedings of the ninth international conference on principles and practice of constraint programming (CP 2003) (pp. 899\u2013903). Berlin: Springer."},{"issue":"3\u20134","key":"5251_CR59","first-page":"293","volume":"8","author":"L.-J. Lin","year":"1992","unstructured":"Lin, L.-J. (1992). Self-improving reactive agents based on reinforcement learning, planning and teaching. Machine Learning, 8(3\u20134), 293\u2013321.","journal-title":"Machine Learning"},{"key":"5251_CR60","first-page":"271","volume-title":"From animals to animats 2","author":"L.-J. Lin","year":"1993","unstructured":"Lin, L.-J., & Mitchell, T. M. (1993). Reinforcement learning with hidden states. In J.-A. Meyer, H. L. Roitblat, & S. W. Wilson (Eds.), From animals to animats 2 (pp. 271\u2013280). Cambridge: MIT Press."},{"issue":"4","key":"5251_CR61","first-page":"285","volume":"2","author":"N. Littlestone","year":"1987","unstructured":"Littlestone, N. (1987). Learning quickly when irrelevant attributes abound: a new linear-threshold algorithm. Machine Learning, 2(4), 285\u2013318.","journal-title":"Machine Learning"},{"key":"5251_CR62","first-page":"262","volume-title":"From animals to animats 2","author":"M. L. Littman","year":"1993","unstructured":"Littman, M. L. (1993). An optimization-based categorization of reinforcement learning environments. In J.-A. Meyer, H. L. Roitblat, & S. W. Wilson (Eds.), From animals to animats 2 (pp. 262\u2013270). Cambridge: MIT Press."},{"key":"5251_CR63","first-page":"323","volume-title":"Proceedings of the fifteenth international conference on machine learning (ICML 1998)","author":"J. Loch","year":"1998","unstructured":"Loch, J., & Singh, S. (1998). Using eligibility traces to find the best memoryless policy in partially observable Markov decision processes. In J. W. Shavlik (Ed.), Proceedings of the fifteenth international conference on machine learning (ICML 1998) (pp. 323\u2013331). Los Altos: Kaufmann."},{"key":"5251_CR64","first-page":"719","volume-title":"Proceedings of the twenty-seventh international conference on machine learning (ICML 2010)","author":"H. R. Maei","year":"2010","unstructured":"Maei, H. R., Szepesv\u00e1ri, C., Bhatnagar, S., & Sutton, R. S. (2010). Toward off-policy learning control with function approximation. In J. F\u00fcrnkranz & T. Joachims (Eds.), Proceedings of the twenty-seventh international conference on machine learning (ICML 2010) (pp. 719\u2013726). Madison: Omnipress."},{"issue":"4","key":"5251_CR65","doi-asserted-by":"crossref","first-page":"403","DOI":"10.1561\/2200000003","volume":"1","author":"S. Mahadevan","year":"2009","unstructured":"Mahadevan, S. (2009). Learning representation and control in Markov decision processes: new frontiers. Foundations and Trends in Machine Learning, 1(4), 403\u2013565.","journal-title":"Foundations and Trends in Machine Learning"},{"key":"5251_CR66","unstructured":"McCallum, A. K. (1996). Reinforcement learning with selective perception and hidden state. Ph.D. thesis, Computer Science Department, University of Rochester."},{"key":"5251_CR67","doi-asserted-by":"crossref","first-page":"387","DOI":"10.1016\/B978-1-55860-377-6.50055-4","volume-title":"Proceedings of the twelfth international conference on machine learning (ICML 1995)","author":"R. A. McCallum","year":"1995","unstructured":"McCallum, R. A. (1995). Instance-based utile distinctions for reinforcement learning with hidden state. In A. Prieditis & S. J. Russell (Eds.), Proceedings of the twelfth international conference on machine learning (ICML 1995) (pp. 387\u2013395). Los Altos: Kaufmann."},{"key":"5251_CR68","doi-asserted-by":"crossref","first-page":"664","DOI":"10.1145\/1390156.1390240","volume-title":"Proceedings of the twenty-fifth international conference on machine learning (ICML 2008)","author":"F. S. Melo","year":"2008","unstructured":"Melo, F. S., Meyn, S. P., & Ribeiro, M. I. (2008). An analysis of reinforcement learning with function approximation. In Proceedings of the twenty-fifth international conference on machine learning (ICML 2008) (pp. 664\u2013671). New York: ACM."},{"key":"5251_CR69","first-page":"291","volume-title":"Proceedings of the seventh international joint conference on autonomous agents and multiagent systems (AAMAS 2008)","author":"J. H. Metzen","year":"2008","unstructured":"Metzen, J. H., Edgington, M., Kassahun, Y., & Kirchner, F. (2008). Analysis of an evolutionary reinforcement learning method in a multiagent domain. In L. Padgham, D. C. Parkes, J. M\u00fcller, & S. Parsons (Eds.), Proceedings of the seventh international joint conference on autonomous agents and multiagent systems (AAMAS 2008) (pp. 291\u2013298). IFAAMAS."},{"key":"5251_CR70","doi-asserted-by":"crossref","first-page":"241","DOI":"10.1613\/jair.613","volume":"11","author":"D. E. Moriarty","year":"1999","unstructured":"Moriarty, D. E., Schultz, A. C., & Grefenstette, J. J. (1999). Evolutionary algorithms for reinforcement learning. The Journal of Artificial Intelligence Research, 11, 241\u2013276.","journal-title":"The Journal of Artificial Intelligence Research"},{"issue":"2\u20133","key":"5251_CR71","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1023\/A:1017992615625","volume":"49","author":"R. Munos","year":"2002","unstructured":"Munos, R., & Moore, A. W. (2002). Variable resolution discretization in optimal control. Machine Learning, 49(2\u20133), 291\u2013323.","journal-title":"Machine Learning"},{"key":"5251_CR72","first-page":"673","volume-title":"Proceedings of the twenty-third international conference on machine learning (ICML 2006)","author":"Y. Nevmyvaka","year":"2006","unstructured":"Nevmyvaka, Y., Feng, Y., & Kearns, M. (2006). Reinforcement learning for optimized trade execution. In W. W. Cohen & A. Moore (Eds.), Proceedings of the twenty-third international conference on machine learning (ICML 2006) (pp. 673\u2013680). New York: ACM."},{"key":"5251_CR73","volume-title":"Advances in neural information processing systems 16 (NIPS 2003)","author":"A. Y. Ng","year":"2004","unstructured":"Ng, A. Y., Kim, H. J., Jordan, M. I., & Sastry, S. (2004). Autonomous helicopter flight via reinforcement learning. In S. Thrun, L. K. Saul, & B. Sch\u00f6lkopf (Eds.), Advances in neural information processing systems 16 (NIPS 2003) Cambridge: MIT Press."},{"issue":"2\u20133","key":"5251_CR74","doi-asserted-by":"crossref","first-page":"161","DOI":"10.1023\/A:1017928328829","volume":"49","author":"D. Ormoneit","year":"2002","unstructured":"Ormoneit, D., & Sen, S. (2002). Kernel-based reinforcement learning. Machine Learning, 49(2\u20133), 161\u2013178.","journal-title":"Machine Learning"},{"issue":"1\u20133","key":"5251_CR75","first-page":"283","volume":"22","author":"J. Peng","year":"1996","unstructured":"Peng, J., & Williams, R. J. (1996). Incremental multi-step Q-learning. Machine Learning, 22(1\u20133), 283\u2013290.","journal-title":"Machine Learning"},{"key":"5251_CR76","first-page":"490","volume-title":"Proceedings of the nineteenth international conference on machine learning (ICML 2002)","author":"T. J. Perkins","year":"2002","unstructured":"Perkins, T. J., & Pendrith, M. D. (2002). On the existence of fixed points for Q-Learning and Sarsa in partially observable domains. In C. Sammut & A. Hoffman (Eds.), Proceedings of the nineteenth international conference on machine learning (ICML 2002) (pp. 490\u2013497). Los Altos: Kaufmann."},{"key":"5251_CR77","first-page":"1595","volume-title":"Advances in neural information processing systems 15 (NIPS 2002)","author":"T. J. Perkins","year":"2003","unstructured":"Perkins, T. J., & Precup, D. (2003). A convergent form of approximate policy iteration. In S. Becker, S. Thrun, & K. Obermayer (Eds.), Advances in neural information processing systems 15 (NIPS 2002) (pp. 1595\u20131602). Cambridge: MIT Press."},{"issue":"4","key":"5251_CR78","doi-asserted-by":"crossref","first-page":"682","DOI":"10.1016\/j.neunet.2008.02.003","volume":"21","author":"J. Peters","year":"2008","unstructured":"Peters, J., & Schaal, S. (2008). Reinforcement learning of motor skills with policy gradients. Neural Networks, 21(4), 682\u2013697.","journal-title":"Neural Networks"},{"key":"5251_CR79","first-page":"871","volume-title":"Proceedings of the twenty-seventh international conference on machine learning (ICML 2010)","author":"M. Petrik","year":"2010","unstructured":"Petrik, M., Taylor, G., Parr, R., & Zilberstein, S. (2010). Feature selection using regularization in approximate linear programs for Markov Decision Processes. In J. F\u00fcrnkranz & T. Joachims (Eds.), Proceedings of the twenty-seventh international conference on machine learning (ICML 2010) (pp. 871\u2013878). Madison: Omnipress."},{"key":"5251_CR80","first-page":"743","volume-title":"Proceedings of the seventeenth international conference on machine learning (ICML 2000)","author":"B. Pfahringer","year":"2000","unstructured":"Pfahringer, B., Bensusan, H., & Giraud-Carrier, C. (2000). Meta-learning by landmarking various learning algorithms. In P. Langley (Ed.), Proceedings of the seventeenth international conference on machine learning (ICML 2000) (pp. 743\u2013750). Los Altos: Kaufmann."},{"key":"5251_CR81","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1613\/jair.2078","volume":"27","author":"J. Pineau","year":"2006","unstructured":"Pineau, J., Gordon, G. J., & Thrun, S. (2006). Anytime point-based approximations for large POMDPs. The Journal of Artificial Intelligence Research, 27, 335\u2013380.","journal-title":"The Journal of Artificial Intelligence Research"},{"key":"5251_CR82","first-page":"417","volume-title":"Proceedings of the eighteenth international conference on machine learning (ICML 2001)","author":"D. Precup","year":"2001","unstructured":"Precup, D., Sutton, R. S., & Dasgupta, S. (2001). Off-policy temporal difference learning with function approximation. In C. E. Brodley & A. P. Danyluk (Eds.), Proceedings of the eighteenth international conference on machine learning (ICML 2001) (pp. 417\u2013424). Los Altos: Kaufmann."},{"key":"5251_CR83","first-page":"725","volume-title":"Proceedings of the thirteenth national conference on artificial intelligence (AAAI 1996)","author":"J. R. Quinlan","year":"1996","unstructured":"Quinlan, J. R. (1996). Bagging, boosting, and C4.5. In Proceedings of the thirteenth national conference on artificial intelligence (AAAI 1996) (pp. 725\u2013730). Berlin: Springer."},{"key":"5251_CR84","doi-asserted-by":"crossref","first-page":"313","DOI":"10.1007\/978-3-540-39857-8_29","volume-title":"Proceedings of the fourteenth European conference on machine learning (ECML 2003)","author":"B. Ratitch","year":"2003","unstructured":"Ratitch, B., & Precup, D. (2003). Using MDP characteristics to guide exploration in reinforcement learning. In N. Lavrac, D. Gamberger, L. Todorovski, & H. Blockeel (Eds.), Proceedings of the fourteenth European conference on machine learning (ECML 2003) (pp. 313\u2013324). Berlin: Springer."},{"key":"5251_CR85","doi-asserted-by":"crossref","first-page":"1895","DOI":"10.1109\/IROS.2007.4399531","volume-title":"Proceedings of the 2007 IEEE\/RSJ international conference on intelligent robots and systems (IROS 2007), pages","author":"A. Rottmann","year":"2007","unstructured":"Rottmann, A., Plagemann, C., Hilgers, P., & Burgard, W. (2007). Autonomous blimp control using model-free reinforcement learning in a continuous state and action space. In Proceedings of the 2007 IEEE\/RSJ international conference on intelligent robots and systems (IROS 2007), pages (pp. 1895\u20131900). New York: IEEE Press."},{"key":"5251_CR86","unstructured":"Rummery, G. A. (1995). Problem solving with reinforcement learning. Ph.D. thesis, Cambridge University Engineering Department, Cambridge, UK."},{"key":"5251_CR87","unstructured":"Rummery, G. A., & Niranjan, M. (1994). On-line Q-learning using connectionist systems. CUED\/F-INFENG\/TR 166, Cambridge University Engineering Department."},{"key":"5251_CR88","first-page":"264","volume-title":"Proceedings of the 1993 connectionist models summer school","author":"P. Sabes","year":"1993","unstructured":"Sabes, P. (1993). Approximating Q-values with basis function representations. In M. Mozer, P. Smolensky, D. Touretzky, J. Elman, & A. Weigend (Eds.), Proceedings of the 1993 connectionist models summer school (pp. 264\u2013271). Hillsdale: Erlbaum."},{"key":"5251_CR89","doi-asserted-by":"crossref","first-page":"194","DOI":"10.1007\/11527862_14","volume-title":"Proceedings of the sixth international symposium on abstraction, reformulation and approximation (SARA 2005)","author":"A. A. Sherstov","year":"2005","unstructured":"Sherstov, A. A., & Stone, P. (2005). Function approximation via tile coding: Automating parameter choice. In J.-D. Zucker & L. Saitta (Eds.), Proceedings of the sixth international symposium on abstraction, reformulation and approximation (SARA 2005) (pp. 194\u2013205). Berlin: Springer."},{"key":"5251_CR90","first-page":"1053","volume-title":"Proceedings of the twentieth international joint conference on artificial intelligence (IJCAI 2007)","author":"D. Silver","year":"2007","unstructured":"Silver, D., Sutton, R. S., & M\u00fcller, M. (2007). Reinforcement learning of local shape in the game of Go. In M. M. Veloso (Ed.), Proceedings of the twentieth international joint conference on artificial intelligence (IJCAI 2007) (pp. 1053\u20131058). IJCAI."},{"key":"5251_CR91","first-page":"974","volume-title":"Advances in neural information processing systems 9 (NIPS 1996)","author":"S. Singh","year":"1997","unstructured":"Singh, S., & Bertsekas, D. (1997). Reinforcement learning for dynamic channel allocation in cellular telephone systems. In M. Mozer, M. I. Jordan, & T. Petsche (Eds.), Advances in neural information processing systems 9 (NIPS 1996) (pp. 974\u2013980). Cambridge: MIT Press."},{"issue":"3","key":"5251_CR92","doi-asserted-by":"crossref","first-page":"287","DOI":"10.1023\/A:1007678930559","volume":"38","author":"S. Singh","year":"2000","unstructured":"Singh, S., Jaakkola, T., Littman, M. L., & Szepesv\u00e1ri, C. (2000). Convergence results for single-step on-policy reinforcement-learning algorithms. Machine Learning, 38(3), 287\u2013308.","journal-title":"Machine Learning"},{"issue":"1\u20133","key":"5251_CR93","first-page":"123","volume":"22","author":"S. P. Singh","year":"1996","unstructured":"Singh, S. P., & Sutton, R. S. (1996). Reinforcement learning with replacing eligibility traces. Machine Learning, 22(1\u20133), 123\u2013158.","journal-title":"Machine Learning"},{"key":"5251_CR94","first-page":"284","volume-title":"Proceedings of the eleventh international conference on machine learning (ICML 1994)","author":"S. P. Singh","year":"1994","unstructured":"Singh, S. P., Jaakkola, T., & Jordan, M. I. (1994). Learning without state-estimation in partially observable Markovian decision processes. In W. W. Cohen & H. Hirsch (Eds.), Proceedings of the eleventh international conference on machine learning (ICML 1994) (pp. 284\u2013292). Los Altos: Kaufmann."},{"key":"5251_CR95","doi-asserted-by":"crossref","DOI":"10.1002\/0471722138","volume-title":"Introduction to stochastic search and optimization","author":"J. C. Spall","year":"2003","unstructured":"Spall, J. C. (2003). Introduction to stochastic search and optimization. Hoboken: Wiley."},{"key":"5251_CR96","unstructured":"Stanley, K. O. (2004). Efficient evolution of neural networks through complexification. Ph.D. thesis, Department of Computer Sciences, University of Texas at Austin. Published as technical report AI-TR-04-314."},{"issue":"3","key":"5251_CR97","doi-asserted-by":"crossref","first-page":"165","DOI":"10.1177\/105971230501300301","volume":"13","author":"P. Stone","year":"2005","unstructured":"Stone, P., Sutton, R. S., & Kuhlmann, G. (2005). Reinforcement learning for RoboCup-soccer keepaway. Adaptive Behavior, 13(3), 165\u2013188.","journal-title":"Adaptive Behavior"},{"key":"5251_CR98","doi-asserted-by":"crossref","first-page":"856","DOI":"10.1145\/1102351.1102459","volume-title":"Proceedings of the twenty-second international conference on machine learning (ICML 2005)","author":"A. L. Strehl","year":"2005","unstructured":"Strehl, A. L., & Littman, M. L. (2005). A theoretical analysis of model-based interval estimation. In L. De Raedt & S. Wrobel (Eds.), Proceedings of the twenty-second international conference on machine learning (ICML 2005) (pp. 856\u2013863). New York: ACM."},{"key":"5251_CR99","first-page":"881","volume-title":"Proceedings of the twenty-third international conference on machine learning (ICML 2006)","author":"A. L. Strehl","year":"2006","unstructured":"Strehl, A. L., Li, L., Wiewiora, E., Langford, J., & Littman, M. L. (2006). PAC model-free reinforcement learning. In W. W. Cohen & A. Moore (Eds.), Proceedings of the twenty-third international conference on machine learning (ICML 2006) (pp. 881\u2013888). New York: ACM."},{"issue":"1","key":"5251_CR100","first-page":"9","volume":"3","author":"R. S. Sutton","year":"1988","unstructured":"Sutton, R. S. (1988). Learning to predict by the methods of temporal differences. Machine Learning, 3(1), 9\u201344.","journal-title":"Machine Learning"},{"key":"5251_CR101","first-page":"216","volume-title":"Proceedings of the seventh international conference on machine learning (ICML 1990)","author":"R. S. Sutton","year":"1990","unstructured":"Sutton, R. S. (1990). Integrated architectures for learning, planning, and reacting based on Approximating Dynamic Programming. In B. W. Porter & R. J. Mooney (Eds.), Proceedings of the seventh international conference on machine learning (ICML 1990) (pp. 216\u2013224). Los Altos: Kaufmann."},{"key":"5251_CR102","first-page":"1038","volume-title":"Advances in neural information processing systems 8 (NIPS 1995)","author":"R. S. Sutton","year":"1996","unstructured":"Sutton, R. S. (1996). Generalization in reinforcement learning: Successful examples using sparse coarse coding. In D. S. Touretzky, M. Mozer, & M. E. Hasselmo (Eds.), Advances in neural information processing systems 8 (NIPS 1995) (pp. 1038\u20131044). Cambridge: MIT Press."},{"key":"5251_CR103","volume-title":"Reinforcement learning: an introduction","author":"R. S. Sutton","year":"1998","unstructured":"Sutton, R. S., & Barto, A. G. (1998). Reinforcement learning: an introduction. Cambridge: MIT Press."},{"key":"5251_CR104","first-page":"91","volume-title":"Proceedings of the eighth Yale workshop on adaptive and learning systems","author":"R. S. Sutton","year":"1994","unstructured":"Sutton, R. S., & Singh, S. P. (1994). On bias and step size in temporal-difference learning. In Proceedings of the eighth Yale workshop on adaptive and learning systems (pp. 91\u201396). New Haven, CT, USA. Center for Systems Science, Yale University."},{"key":"5251_CR105","first-page":"1057","volume-title":"Advances in neural information processing systems 12 (NIPS 1999)","author":"R. S. Sutton","year":"2000","unstructured":"Sutton, R. S., McAllester, D. A., Singh, S. P., & Mansour, Y. (2000). Policy gradient methods for reinforcement learning with function approximation. In S. A. Solla, T. K. Leen, & K.-R. M\u00fcller (Eds.), Advances in neural information processing systems 12 (NIPS 1999) (pp. 1057\u20131063). Cambridge: MIT Press."},{"key":"5251_CR106","doi-asserted-by":"crossref","first-page":"993","DOI":"10.1145\/1553374.1553501","volume-title":"Proceedings of the twenty-sixth annual international conference on machine learning (ICML 2009)","author":"R. S. Sutton","year":"2009","unstructured":"Sutton, R. S., Maei, H. R., Precup, D., Bhatnagar, S., Silver, D., Szepesv\u00e1ri, C., & Wiewiora, E. (2009). Fast gradient-descent methods for temporal-difference learning with linear function approximation. In A. P. Danyluk, L. Bottou, & M. L. Littman (Eds.), Proceedings of the twenty-sixth annual international conference on machine learning (ICML 2009) (pp. 993\u20131000). New York: ACM."},{"issue":"2","key":"5251_CR107","doi-asserted-by":"crossref","first-page":"167","DOI":"10.1007\/s10994-009-5102-1","volume":"75","author":"T. Suttorp","year":"2009","unstructured":"Suttorp, T., Hansen, N., & Igel, C. (2009). Efficient covariance matrix update for variable metric evolution strategies. Machine Learning, 75(2), 167\u2013197.","journal-title":"Machine Learning"},{"issue":"12","key":"5251_CR108","doi-asserted-by":"crossref","first-page":"2936","DOI":"10.1162\/neco.2006.18.12.2936","volume":"18","author":"I. Szita","year":"2006","unstructured":"Szita, I., & L\u0151rincz, A. (2006). Learning Tetris using the noisy cross-entropy method. Neural Computation, 18(12), 2936\u20132941.","journal-title":"Neural Computation"},{"key":"5251_CR109","doi-asserted-by":"crossref","first-page":"659","DOI":"10.1613\/jair.2368","volume":"30","author":"I. Szita","year":"2007","unstructured":"Szita, I., & L\u0151rincz, A. (2007). Learning to play using low-complexity rule-based policies: Illustrations through Ms. Pac-Man. The Journal of Artificial Intelligence Research, 30, 659\u2013684.","journal-title":"The Journal of Artificial Intelligence Research"},{"key":"5251_CR110","doi-asserted-by":"crossref","first-page":"1048","DOI":"10.1145\/1390156.1390288","volume-title":"Proceedings of the twenty-fifth international conference on machine learning (ICML 2008)","author":"I. Szita","year":"2008","unstructured":"Szita, I., & L\u0151rincz, A. (2008). The many faces of optimism: a unifying approach. In W. W. Cohen, A. McCallum, & S. T. Roweis (Eds.), Proceedings of the twenty-fifth international conference on machine learning (ICML 2008) (pp. 1048\u20131055). New York: ACM."},{"key":"5251_CR111","first-page":"2849","volume-title":"Proceedings of the 2004 IEEE\/RSJ international conference on intelligent robots and systems (IROS 2004)","author":"R. Tedrake","year":"2004","unstructured":"Tedrake, R., Zhang, T. W., & Seung, H. S. (2004). Stochastic policy gradient reinforcement learning on a simple 3D biped. In Proceedings of the 2004 IEEE\/RSJ international conference on intelligent robots and systems (IROS 2004) (pp. 2849\u20132854). New York: IEEE Press."},{"issue":"3\u20134","key":"5251_CR112","first-page":"257","volume":"8","author":"G. Tesauro","year":"1992","unstructured":"Tesauro, G. (1992). Practical issues in temporal difference learning. Machine Learning, 8(3\u20134), 257\u2013277.","journal-title":"Machine Learning"},{"issue":"3","key":"5251_CR113","doi-asserted-by":"crossref","first-page":"287","DOI":"10.1007\/s10586-007-0035-6","volume":"10","author":"G. Tesauro","year":"2007","unstructured":"Tesauro, G., Jong, N. K., Das, R., & Bennani, M. N. (2007). On the use of hybrid reinforcement learning for autonomic resource allocation. Cluster Computing, 10(3), 287\u2013299.","journal-title":"Cluster Computing"},{"key":"5251_CR114","first-page":"255","volume-title":"Proceedings of the 1993 connectionist models summer school","author":"S. Thrun","year":"1993","unstructured":"Thrun, S., & Schwartz, A. (1993). Issues in using function approximation for reinforcement learning. In M. Mozer, P. Smolensky, D. Touretzky, J. Elman, & A. Weigend (Eds.), Proceedings of the 1993 connectionist models summer school (pp. 255\u2013263). Hillsdale: Lawrence Erlbaum."},{"key":"5251_CR115","first-page":"30","volume-title":"K\u00fcnstliche Intelligenz","author":"J. Togelius","year":"2009","unstructured":"Togelius, J., Schaul, T., Wierstra, D., Igel, C., Gomez, F., & Schmidhuber, J. (2009). Ontogenetic and phylogenetic reinforcement learning. In K\u00fcnstliche Intelligenz (pp. 30\u201333)."},{"key":"5251_CR116","doi-asserted-by":"crossref","first-page":"674","DOI":"10.1109\/9.580874","volume":"42","author":"J. N. Tsitsiklis","year":"1997","unstructured":"Tsitsiklis, J. N., & Van Roy, B. (1997). An analysis of temporal-difference learning with function approximation. IEEE Transactions on Automatic Control, 42, 674\u2013690.","journal-title":"IEEE Transactions on Automatic Control"},{"key":"5251_CR117","doi-asserted-by":"crossref","first-page":"177","DOI":"10.1109\/ADPRL.2009.4927542","volume-title":"Proceedings of the 2009 IEEE symposium on adaptive dynamic programming and reinforcement learning (ADPRL 2009)","author":"H. Seijen van","year":"2009","unstructured":"van Seijen, H., van Hasselt, H., Whiteson, S., & Wiering, M. (2009). A theoretical and empirical analysis of Expected Sarsa. In Proceedings of the 2009 IEEE symposium on adaptive dynamic programming and reinforcement learning (ADPRL 2009) (pp. 177\u2013184). New York: IEEE Press."},{"issue":"2","key":"5251_CR118","doi-asserted-by":"crossref","first-page":"77","DOI":"10.1023\/A:1019956318069","volume":"18","author":"R. Vilalta","year":"2002","unstructured":"Vilalta, R., & Drissi, Y. (2002). A perspective view and survey of meta-learning. Artificial Intelligence Review, 18(2), 77\u201395.","journal-title":"Artificial Intelligence Review"},{"key":"5251_CR119","unstructured":"Watkins, C. J. C. H. (1989). Learning from delayed rewards. Ph.D. thesis, King\u2019s College, Cambridge, UK."},{"issue":"3\u20134","key":"5251_CR120","first-page":"279","volume":"8","author":"C. J. C. H. Watkins","year":"1992","unstructured":"Watkins, C. J. C. H., & Dayan, P. (1992). Q-learning. Machine Learning, 8(3\u20134), 279\u2013292.","journal-title":"Machine Learning"},{"issue":"1","key":"5251_CR121","first-page":"45","volume":"7","author":"S. D. Whitehead","year":"1991","unstructured":"Whitehead, S. D., & Ballard, D. H. (1991). Learning to perceive and act by trial and error. Machine Learning, 7(1), 45\u201383.","journal-title":"Machine Learning"},{"issue":"7","key":"5251_CR122","doi-asserted-by":"crossref","first-page":"855","DOI":"10.1016\/S0952-1976(04)00109-5","volume":"17","author":"S. Whiteson","year":"2004","unstructured":"Whiteson, S., & Stone, P. (2004). Adaptive job routing and scheduling. Engineering Applications of Artificial Intelligence, 17(7), 855\u2013869. Special issue on Autonomic Computing and Automation.","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"5251_CR123","first-page":"877","volume":"7","author":"S. Whiteson","year":"2006","unstructured":"Whiteson, S., & Stone, P. (2006). Evolutionary function approximation for reinforcement learning. Journal of Machine Learning Research, 7, 877\u2013917.","journal-title":"Journal of Machine Learning Research"},{"key":"5251_CR124","doi-asserted-by":"crossref","first-page":"120","DOI":"10.1109\/ADPRL.2011.5967363","volume-title":"Proceedings of the 2011 IEEE Symposium on Adaptive Dynamic Programming and Reinforcement Learning (ADPRL 2011)","author":"S. Whiteson","year":"2011","unstructured":"Whiteson, S., Tanner, B., Taylor, M. E., & Stone, P. (2011). Protecting against evaluation overfitting in empirical reinforcement learning. In Proceedings of the 2011 IEEE Symposium on Adaptive Dynamic Programming and Reinforcement Learning (ADPRL 2011) (pp. 120\u2013127). New York: IEEE Press."},{"issue":"1","key":"5251_CR125","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10458-009-9100-2","volume":"21","author":"S. Whiteson","year":"2010","unstructured":"Whiteson, S., Taylor, M. E., & Stone, P. (2010). Critical factors in the empirical performance of temporal difference and evolutionary methods for reinforcement learning. Autonomous Agents and Multi-Agent Systems, 21(1), 1\u201335.","journal-title":"Autonomous Agents and Multi-Agent Systems"},{"issue":"1","key":"5251_CR126","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1109\/4235.585893","volume":"1","author":"D. H. Wolpert","year":"1997","unstructured":"Wolpert, D. H., & Macready, W. G. (1997). No free lunch theorems for optimization. IEEE Transactions on Evolutionary Computation, 1(1), 67\u201382.","journal-title":"IEEE Transactions on Evolutionary Computation"},{"key":"5251_CR127","doi-asserted-by":"crossref","first-page":"565","DOI":"10.1613\/jair.2490","volume":"32","author":"L. Xu","year":"2008","unstructured":"Xu, L., Hutter, F., Hoos, H. H., & Leyton-Brown, K. (2008). SATzilla: portfolio-based algorithm selection for SAT. The Journal of Artificial Intelligence Research, 32, 565\u2013606.","journal-title":"The Journal of Artificial Intelligence Research"},{"key":"5251_CR128","first-page":"1114","volume-title":"Proceedings of the fourteenth international joint conference on artificial intelligence (IJCAI 1995)","author":"W. Zhang","year":"1995","unstructured":"Zhang, W., & Dietterich, T. G. (1995). A reinforcement learning approach to job-shop scheduling. In Proceedings of the fourteenth international joint conference on artificial intelligence (IJCAI 1995) (pp. 1114\u20131120). Los Altos: Kaufmann."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-011-5251-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-011-5251-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-011-5251-x","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,7]],"date-time":"2023-06-07T05:20:09Z","timestamp":1686115209000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-011-5251-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011,6,3]]},"references-count":128,"journal-issue":{"issue":"1-2","published-print":{"date-parts":[[2011,7]]}},"alternative-id":["5251"],"URL":"https:\/\/doi.org\/10.1007\/s10994-011-5251-x","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2011,6,3]]}}}