{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T06:36:14Z","timestamp":1778654174668,"version":"3.51.4"},"reference-count":83,"publisher":"Springer Science and Business Media LLC","issue":"1-2","license":[{"start":{"date-parts":[[2012,8,10]],"date-time":"2012-08-10T00:00:00Z","timestamp":1344556800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2012,10]]},"DOI":"10.1007\/s10994-012-5313-8","type":"journal-article","created":{"date-parts":[[2012,8,9]],"date-time":"2012-08-09T18:05:57Z","timestamp":1344535557000},"page":"123-156","source":"Crossref","is-referenced-by-count":71,"title":["Preference-based reinforcement learning: a formal framework and a policy iteration algorithm"],"prefix":"10.1007","volume":"89","author":[{"given":"Johannes","family":"F\u00fcrnkranz","sequence":"first","affiliation":[]},{"given":"Eyke","family":"H\u00fcllermeier","sequence":"additional","affiliation":[]},{"given":"Weiwei","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Sang-Hyeun","family":"Park","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2012,8,10]]},"reference":[{"key":"5313_CR1","first-page":"554","volume-title":"Encyclopedia of machine learning","author":"P. Abbeel","year":"2010","unstructured":"Abbeel, P., & Ng, A. Y. (2010). Inverse reinforcement learning. In C. Sammut & G. I. Webb (Eds.), Encyclopedia of machine learning (pp. 554\u2013558). Berlin: Springer."},{"key":"5313_CR2","doi-asserted-by":"crossref","first-page":"12","DOI":"10.1007\/978-3-642-23780-5_11","volume-title":"Proceedings of the European conference on machine learning and knowledge discovery in databases (ECML-PKDD-11), part\u00a0I","author":"R. Akrour","year":"2011","unstructured":"Akrour, R., Schoenauer, M., & Sebag, M. (2011). Preference-based policy learning. In D.\u00a0Gunopulos, T.\u00a0Hofmann, D.\u00a0Malerba, & M.\u00a0Vazirgiannis (Eds.), Proceedings of the European conference on machine learning and knowledge discovery in databases (ECML-PKDD-11), part\u00a0I, Athens, Greece (pp.\u00a012\u201327). Berlin: Springer."},{"issue":"2","key":"5313_CR3","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1162\/089976698300017746","volume":"10","author":"S. Amari","year":"1998","unstructured":"Amari, S. (1998). Natural gradient works efficiently in learning. Neural Computation, 10(2), 251\u2013276.","journal-title":"Neural Computation"},{"key":"5313_CR4","unstructured":"Arenz, O. (2012). Monte-Carlo chess. Bachelor\u2019s thesis. TU Darmstadt: Knowledge Engineering Group."},{"key":"5313_CR5","first-page":"41","volume-title":"Proceedings of the 23rd conference on learning theory (COLT-10)","author":"J.-Y. Audibert","year":"2010","unstructured":"Audibert, J.-Y., Bubeck, S., & Munos, R. (2010). Best arm identification in multi-armed bandits. In A. T. Kalai & M. Mohri (Eds.), Proceedings of the 23rd conference on learning theory (COLT-10), Haifa, Israel (pp. 41\u201353). Hawthorne: Omnipress."},{"key":"5313_CR6","first-page":"322","volume-title":"Proceedings of the 36th annual symposium on foundations of computer science","author":"P. Auer","year":"1995","unstructured":"Auer, P., Cesa-Bianchi, N., Freund, Y., & Schapire, R. E. (1995). Gambling in a rigged casino: the adversarial multi-arm bandit problem. In Proceedings of the 36th annual symposium on foundations of computer science (pp. 322\u2013331). Los Alamitos: IEEE Computer Society Press."},{"issue":"2\u20133","key":"5313_CR7","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1023\/A:1013689704352","volume":"47","author":"P. Auer","year":"2002","unstructured":"Auer, P., Cesa-Bianchi, N., & Fischer, P. (2002). Finite-time analysis of the multiarmed bandit problem. Machine Learning, 47(2\u20133), 235\u2013256.","journal-title":"Machine Learning"},{"key":"5313_CR8","first-page":"835","volume":"13","author":"A. G. Barto","year":"1983","unstructured":"Barto, A. G., Sutton, R. S., & Anderson, C. (1983). Neuron-like elements that can solve difficult learning control problems. IEEE Transactions on Systems, Man and Cybernetics, 13, 835\u2013846.","journal-title":"IEEE Transactions on Systems, Man and Cybernetics"},{"issue":"3","key":"5313_CR9","doi-asserted-by":"crossref","first-page":"243","DOI":"10.1023\/A:1007634325138","volume":"40","author":"J. Baxter","year":"2000","unstructured":"Baxter, J., Tridgell, A., & Weaver, L. (2000). Learning to play chess using temporal differences. Machine Learning, 40(3), 243\u2013263.","journal-title":"Machine Learning"},{"issue":"1\u20132","key":"5313_CR10","doi-asserted-by":"crossref","first-page":"105","DOI":"10.1016\/S0304-3975(00)00078-5","volume":"252","author":"D. F. Beal","year":"2001","unstructured":"Beal, D. F., & Smith, M. C. (2001). Temporal difference learning applied to game playing and the results of application to Shogi. Theoretical Computer Science, 252(1\u20132), 105\u2013119. Special Issue on Papers from the Computers and Games 1998 Conference.","journal-title":"Theoretical Computer Science"},{"issue":"11","key":"5313_CR11","doi-asserted-by":"crossref","first-page":"2471","DOI":"10.1016\/j.automatica.2009.07.008","volume":"45","author":"S. Bhatnagar","year":"2009","unstructured":"Bhatnagar, S., Sutton, R. S., Ghavamzadeh, M., & Lee, M. (2009). Natural actor-critic algorithms. Automatica, 45(11), 2471\u20132482.","journal-title":"Automatica"},{"key":"5313_CR12","first-page":"61","volume-title":"Proc. 18th conference in uncertainty in artificial intelligence (UAI-02)","author":"B. Bonet","year":"2002","unstructured":"Bonet, B., & Pearl, J. (2002). Qualitative mdps and pomdps: an order-of-magnitude approximation. In Proc. 18th conference in uncertainty in artificial intelligence (UAI-02), Alberta, Canada (pp. 61\u201368)."},{"issue":"1\u20132","key":"5313_CR13","doi-asserted-by":"crossref","first-page":"217","DOI":"10.1016\/S0004-3702(97)00024-6","volume":"94","author":"R. I. Brafman","year":"1997","unstructured":"Brafman, R. I., & Tennenholtz, M. (1997). Modeling agents as qualitative decision makers. Artificial Intelligence, 94(1\u20132), 217\u2013268.","journal-title":"Artificial Intelligence"},{"issue":"4","key":"5313_CR14","first-page":"107","volume":"24","author":"I. Bratko","year":"2003","unstructured":"Bratko, I., & Suc, D. (2003). Learning qualitative models. AI Magazine, 24(4), 107\u2013119.","journal-title":"AI Magazine"},{"key":"5313_CR15","doi-asserted-by":"crossref","first-page":"312","DOI":"10.1007\/978-3-642-23780-5_30","volume-title":"Proceedings of the European conference on machine learning and knowledge discovery in databases (ECML-PKDD-11), part I","author":"W. Cheng","year":"2011","unstructured":"Cheng, W., F\u00fcrnkranz, J., H\u00fcllermeier, E., & Park, S.-H. (2011). Preference-based policy iteration: leveraging preference learning for reinforcement learning. In D. Gunopulos, T. Hofmann, D. Malerba, & M. Vazirgiannis (Eds.), Proceedings of the European conference on machine learning and knowledge discovery in databases (ECML-PKDD-11), part I, Athens, Greece (pp. 312\u2013327). Berlin: Springer."},{"key":"5313_CR16","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1023\/A:1007518724497","volume":"33","author":"R. Crites","year":"1998","unstructured":"Crites, R., & Barto, A. (1998). Elevator group control using multiple reinforcement learning agents. Machine Learning, 33, 235\u2013262.","journal-title":"Machine Learning"},{"key":"5313_CR17","first-page":"1","volume":"7","author":"J. Dems\u0306ar","year":"2006","unstructured":"Dems\u0306ar, J. (2006). Statistical comparisons of classifiers over multiple data sets. Journal of Machine Learning Research, 7, 1\u201330.","journal-title":"Journal of Machine Learning Research"},{"issue":"3","key":"5313_CR18","doi-asserted-by":"crossref","first-page":"157","DOI":"10.1007\/s10994-008-5069-3","volume":"72","author":"C. Dimitrakakis","year":"2008","unstructured":"Dimitrakakis, C., & Lagoudakis, M. G. (2008). Rollout sampling approximate policy iteration. Machine Learning, 72(3), 157\u2013171.","journal-title":"Machine Learning"},{"issue":"2","key":"5313_CR19","first-page":"55","volume":"20","author":"J. Doyle","year":"1999","unstructured":"Doyle, J., & Thomason, R. (1999). Background to qualitative decision theory. AI Magazine, 20(2), 55\u201368.","journal-title":"AI Magazine"},{"issue":"3","key":"5313_CR20","doi-asserted-by":"crossref","first-page":"271","DOI":"10.1023\/B:MACH.0000039779.47329.3a","volume":"57","author":"K. Driessens","year":"2004","unstructured":"Driessens, K., & D\u017eeroski, S. (2004). Integrating guidance into relational reinforcement learning. Machine Learning, 57(3), 271\u2013304.","journal-title":"Machine Learning"},{"issue":"4","key":"5313_CR21","first-page":"209","volume":"31","author":"S. Droste","year":"2008","unstructured":"Droste, S., & F\u00fcrnkranz, J. (2008). Learning the piece values for three chess variants. International Computer Games Association Journal, 31(4), 209\u2013233.","journal-title":"International Computer Games Association Journal"},{"issue":"1\u20132","key":"5313_CR22","doi-asserted-by":"crossref","first-page":"219","DOI":"10.1016\/S0004-3702(03)00037-7","volume":"148","author":"D. Dubois","year":"2003","unstructured":"Dubois, D., Fargier, H., & Perny, P. (2003). Qualitative decision theory with preference relations and comparative uncertainty: an axiomatic approach. Artificial Intelligence, 148(1\u20132), 219\u2013260.","journal-title":"Artificial Intelligence"},{"issue":"1\u20132","key":"5313_CR23","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1023\/A:1007694015589","volume":"43","author":"S. D\u017eeroski","year":"2001","unstructured":"D\u017eeroski, S., De Raedt, L., & Driessens, K. (2001). Relational reinforcement learning. Machine Learning, 43(1\u20132), 7\u201352.","journal-title":"Machine Learning"},{"key":"5313_CR24","unstructured":"Edwards, S. J. (1994). Portable game notation. http:\/\/folk.uio.no\/andreio\/docs\/pgnspec.pdf ."},{"key":"5313_CR25","doi-asserted-by":"crossref","first-page":"305","DOI":"10.1145\/1143844.1143883","volume-title":"Proceedings of the 23rd international conference on machine learning (ICML-06)","author":"A. Epshteyn","year":"2006","unstructured":"Epshteyn, A., & DeJong, G. (2006). Qualitative reinforcement learning. In W. Cohen & A. Moore (Eds.), Proceedings of the 23rd international conference on machine learning (ICML-06), Pittsburgh, Pennsylvania, USA (pp. 305\u2013312)."},{"key":"5313_CR26","first-page":"162","volume-title":"Proceedings of the 20th international conference on machine learning (ICML-03)","author":"E. Even-Dar","year":"2003","unstructured":"Even-Dar, E., Mannor, S., & Mansour, Y. (2003). Action elimination and stopping conditions for reinforcement learning. In T. Fawcett & N. Mishra (Eds.), Proceedings of the 20th international conference on machine learning (ICML-03), Washington, DC (pp. 162\u2013169). Menlo Park: AAAI Press."},{"issue":"1\u20132","key":"5313_CR27","doi-asserted-by":"crossref","first-page":"245","DOI":"10.1016\/j.artint.2004.12.002","volume":"164","author":"H. Fargier","year":"2005","unstructured":"Fargier, H., & Sabbadin, R. (2005). Qualitative decision under uncertainty: back to expected utility. Artificial Intelligence, 164(1\u20132), 245\u2013280.","journal-title":"Artificial Intelligence"},{"key":"5313_CR28","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1613\/jair.1700","volume":"25","author":"A. Fern","year":"2006","unstructured":"Fern, A., Yoon, S. W., & Givan, R. (2006). Approximate policy iteration with a policy language bias: solving relational Markov decision processes. The Journal of Artificial Intelligence Research, 25, 75\u2013118.","journal-title":"The Journal of Artificial Intelligence Research"},{"key":"5313_CR29","doi-asserted-by":"crossref","first-page":"633","DOI":"10.1007\/978-0-387-30164-8_504","volume-title":"Encyclopedia of machine learning","author":"J. F\u00fcrnkranz","year":"2011","unstructured":"F\u00fcrnkranz, J. (2011). Machine learning and game playing. In C. Sammut & G. I. Webb (Eds.), Encyclopedia of machine learning (pp. 633\u2013637). Berlin: Springer."},{"key":"5313_CR30","volume-title":"Preference learning","year":"2010","unstructured":"F\u00fcrnkranz, J. & H\u00fcllermeier, E. (Eds.) (2010). Preference learning. Berlin: Springer."},{"key":"5313_CR31","first-page":"291","volume-title":"Proceedings of the 14th European conference on artificial intelligence (ECAI-00)","author":"J. F\u00fcrnkranz","year":"2000","unstructured":"F\u00fcrnkranz, J., Pfahringer, B., Kaindl, H., & Kramer, S. (2000). Learning to use operational advice. In W.\u00a0Horn (Ed.), Proceedings of the 14th European conference on artificial intelligence (ECAI-00), Berlin (pp.\u00a0291\u2013295). Amsterdam: IOS press."},{"key":"5313_CR32","volume-title":"Proceedings of the ICML-10 workshop on reinforcement learning and search in very large spaces","author":"V. Gabillon","year":"2010","unstructured":"Gabillon, V., Lazaric, A., & Ghavamzadeh, M. (2010). Rollout allocation strategies for classification-based policy iteration. In P. Auer, S. Kaski, & C. Szepesv\u00e0ri (Eds.), Proceedings of the ICML-10 workshop on reinforcement learning and search in very large spaces."},{"key":"5313_CR33","first-page":"1049","volume-title":"Proceedings of the 28th international conference on machine learning (ICML-11)","author":"V. Gabillon","year":"2011","unstructured":"Gabillon, V., Lazaric, A., Ghavamzadeh, M., & Scherrer, B. (2011). Classification-based policy iteration with a critic. In L. Getoor & T. Scheffer (Eds.), Proceedings of the 28th international conference on machine learning (ICML-11), New York, NY, USA (pp. 1049\u20131056). New York: ACM Press."},{"key":"5313_CR34","first-page":"197","volume-title":"Proceedings of the 15th international conference on machine learning (ICML-98)","author":"Z. G\u00e1bor","year":"1998","unstructured":"G\u00e1bor, Z., Kalm\u00e1r, Z., & Szepesv\u00e0ri, C. (1998). Multi-criteria reinforcement learning. In Proceedings of the 15th international conference on machine learning (ICML-98), Madison, WI (pp. 197\u2013205). San Mateo: Morgan Kaufmann."},{"issue":"1","key":"5313_CR35","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1656274.1656278","volume":"11","author":"M. Hall","year":"2009","unstructured":"Hall, M., Frank, E., Holmes, G., Pfahringer, B., Reutemann, P., & Witten, I. H. (2009). The weka data mining software: an update. SIGKDD Explorations, 11(1), 10\u201318.","journal-title":"SIGKDD Explorations"},{"key":"5313_CR36","series-title":"ACM international conference proceeding series","first-page":"401","volume-title":"Proceedings of the 26th international conference on machine learning (ICML-09)","author":"V. Heidrich-Meisner","year":"2009","unstructured":"Heidrich-Meisner, V., & Igel, C. (2009). Hoeffding and Bernstein races for selecting policies in evolutionary direct policy search. In A. P. Danyluk, L. Bottou, & M. L. Littman (Eds.), ACM international conference proceeding series: Proceedings of the 26th international conference on machine learning (ICML-09), Montreal, Canada (pp. 401\u2013408)."},{"key":"5313_CR37","doi-asserted-by":"crossref","first-page":"1897","DOI":"10.1016\/j.artint.2008.08.002","volume":"172","author":"E. H\u00fcllermeier","year":"2008","unstructured":"H\u00fcllermeier, E., F\u00fcrnkranz, J., Cheng, W., & Brinker, K. (2008). Label ranking by learning pairwise preferences. Artificial Intelligence, 172, 1897\u20131916.","journal-title":"Artificial Intelligence"},{"key":"5313_CR38","first-page":"1531","volume-title":"Advances in neural information processing systems 14 (NIPS-2001)","author":"S. Kakade","year":"2001","unstructured":"Kakade, S. (2001). A natural policy gradient. In T. G. Dietterich, S. Becker, & Z. Ghahramani (Eds.), Advances in neural information processing systems 14 (NIPS-2001), Vancouver, British Columbia, Canada (pp. 1531\u20131538). Cambridge: MIT Press."},{"key":"5313_CR39","first-page":"181","volume-title":"Preference leaning","author":"T. Kamishima","year":"2011","unstructured":"Kamishima, T., Kazawa, H., & Akaho, S. (2011). A survey and empirical comparison of object ranking methods. In J. F\u00fcrnkranz & E. H\u00fcllermeier (Eds.), Preference leaning (pp. 181\u2013201). Berlin: Springer."},{"key":"5313_CR40","doi-asserted-by":"crossref","first-page":"456","DOI":"10.1145\/1390156.1390214","volume-title":"Proceedings of the 25th international conference on machine learning (ICML 2008)","author":"K. Kersting","year":"2008","unstructured":"Kersting, K., & Driessens, K. (2008). Non-parametric policy gradients: a unified treatment of propositional and relational domains. In W. W. Cohen, A. McCallum, & S. T. Roweis (Eds.), Proceedings of the 25th international conference on machine learning (ICML 2008), Helsinki, Finland (pp. 456\u2013463). New York: ACM Press."},{"issue":"1\u20132","key":"5313_CR41","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1007\/s10994-010-5223-6","volume":"84","author":"J. Kober","year":"2011","unstructured":"Kober, J., & Peters, J. (2011). Policy search for motor primitives in robotics. Machine Learning, 84(1\u20132), 171\u2013203.","journal-title":"Machine Learning"},{"issue":"4","key":"5313_CR42","doi-asserted-by":"crossref","first-page":"1143","DOI":"10.1137\/S0363012901385691","volume":"42","author":"V. R. Konda","year":"2003","unstructured":"Konda, V. R., & Tsitsiklis, J. N. (2003). On actor-critic algorithms. SIAM Journal on Control and Optimization, 42(4), 1143\u20131166.","journal-title":"SIAM Journal on Control and Optimization"},{"key":"5313_CR43","volume-title":"Qualitative reasoning","author":"B. Kuipers","year":"1994","unstructured":"Kuipers, B. (1994). Qualitative reasoning. Cambridge: MIT Press."},{"key":"5313_CR44","first-page":"424","volume-title":"Proceedings of the 20th international conference on machine learning (ICML-03)","author":"M. G. Lagoudakis","year":"2003","unstructured":"Lagoudakis, M. G., & Parr, R. (2003). Reinforcement learning as classification: leveraging modern classifiers. In T. E. Fawcett & N. Mishra (Eds.), Proceedings of the 20th international conference on machine learning (ICML-03), Washington, DC, USA: (pp. 424\u2013431). Menlo Park: AAAI Press."},{"key":"5313_CR45","volume-title":"Advances in neural information processing systems 20 (NIPS-21)","author":"J. Langford","year":"2008","unstructured":"Langford, J., & Zhang, T. (2008). The epoch-greedy algorithm for multi-armed bandits with side information. In J. C. Platt, D. Koller, Y. Singer, & S. T. Roweis (Eds.), Advances in neural information processing systems 20 (NIPS-21), Vancouver, Canada. Cambridge: MIT Press"},{"key":"5313_CR46","series-title":"ACM international conference proceeding series","doi-asserted-by":"crossref","first-page":"528","DOI":"10.1145\/1390156.1390223","volume-title":"Proceedings of the 25th international conference on machine learning (ICML-08)","author":"J. Langford","year":"2008","unstructured":"Langford, J., Strehl, A. L., & Wortman, J. (2008). Exploration scavenging. In W. W. Cohen, A. McCallum, & S. T. Roweis (Eds.), ACM international conference proceeding series: Vol.\u00a0307. Proceedings of the 25th international conference on machine learning (ICML-08), Helsinki, Finland (pp. 528\u2013535). New York: ACM Press."},{"key":"5313_CR47","first-page":"607","volume-title":"Proceedings of the 27th international conference on machine learning (ICML-10)","author":"A. Lazaric","year":"2010","unstructured":"Lazaric, A., Ghavamzadeh, M., & Munos, R. (2010). Analysis of a classification-based policy iteration algorithm. In J. F\u00fcrnkranz & T. Joachims (Eds.), Proceedings of the 27th international conference on machine learning (ICML-10) (pp. 607\u2013614). Hawthorne: Omnipress."},{"key":"5313_CR48","doi-asserted-by":"crossref","first-page":"661","DOI":"10.1145\/1772690.1772758","volume-title":"Proceedings of the 19th international conference on world wide web (WWW-10)","author":"L. Li","year":"2010","unstructured":"Li, L., Chu, W., Langford, J., & Schapire, R. E. (2010). A contextual-bandit approach to personalized news article recommendation. In M. Rappa, P. Jones, J. Freire, & S. Chakrabarti (Eds.), Proceedings of the 19th international conference on world wide web (WWW-10), Raleigh, North Carolina (pp. 661\u2013670). New York: ACM Press."},{"issue":"7\u20139","key":"5313_CR49","doi-asserted-by":"crossref","first-page":"1164","DOI":"10.1016\/j.neucom.2009.11.024","volume":"73","author":"E. Loza Menc\u00eda","year":"2010","unstructured":"Loza Menc\u00eda, E., Park, S.-H., & F\u00fcrnkranz, J. (2010). Efficient voting prediction for pairwise multilabel classification. Neurocomputing, 73(7\u20139), 1164\u20131176.","journal-title":"Neurocomputing"},{"issue":"1\u20133","key":"5313_CR50","first-page":"251","volume":"22","author":"R. Maclin","year":"1996","unstructured":"Maclin, R., & Shavlik, J. W. (1996). Creating advice-taking reinforcement learners. Machine Learning, 22(1\u20133), 251\u2013281.","journal-title":"Machine Learning"},{"key":"5313_CR51","first-page":"819","volume-title":"Proceedings of the 20th national conference on artificial intelligence (AAAI-05)","author":"R. Maclin","year":"2005","unstructured":"Maclin, R., Shavlik, J. W., Torrey, L., Walker, T., & Wild, E. W. (2005). Giving advice about preferred actions to reinforcement learners via knowledge-based kernel regression. In M. M. Veloso & S. Kambhampati (Eds.), Proceedings of the 20th national conference on artificial intelligence (AAAI-05), Pittsburgh, Pennsylvania (pp. 819\u2013824). Menlo Park\/Cambridge: AAAI Press\/MIT Press."},{"key":"5313_CR52","first-page":"719","volume-title":"Proceedings of the 27th international conference on machine learning (ICML-10)","author":"H. R. Maei","year":"2010","unstructured":"Maei, H. R., Szepesv\u00e0ri, C., Bhatnagar, S., & Sutton, R. S. (2010). Toward off-policy learning control with function approximation. In J. F\u00fcrnkranz & T. Joachims (Eds.), Proceedings of the 27th international conference on machine learning (ICML-10), Haifa, Israel (pp. 719\u2013726). Hawthorne: Omnipress."},{"key":"5313_CR53","unstructured":"Maes, F. (2009). Learning in Markov decision processes for structured prediction. PhD thesis, University Pierre et Marie Curie, Paris, France."},{"key":"5313_CR54","first-page":"325","volume":"5","author":"S. Mannor","year":"2004","unstructured":"Mannor, S., & Shimkin, N. (2004). A geometric approach to multi-criterion reinforcement learning. Journal of Machine Learning Research, 5, 325\u2013360.","journal-title":"Journal of Machine Learning Research"},{"key":"5313_CR55","first-page":"512","volume-title":"Proceedings of the 20th international conference on machine learning (ICML-03)","author":"S. Mannor","year":"2003","unstructured":"Mannor, S., Rubinstein, R. Y., & Gat, Y. (2003). The cross entropy method for fast policy search. In T. Fawcett & N. Mishra (Eds.), Proceedings of the 20th international conference on machine learning (ICML-03), Washington, DC (pp. 512\u2013519). Menlo Park: AAAI Press."},{"issue":"2","key":"5313_CR56","doi-asserted-by":"crossref","first-page":"350","DOI":"10.1287\/moor.12.2.350","volume":"12","author":"W. A. Massey","year":"1987","unstructured":"Massey, W. A. (1987). Stochastic orderings for Markov processes on partially ordered spaces. Mathematics of Operations Research, 12(2), 350\u2013367.","journal-title":"Mathematics of Operations Research"},{"key":"5313_CR57","first-page":"406","volume-title":"Proceedings of the 16th conference in uncertainty in artificial intelligence (UAI-00)","author":"A. Y. Ng","year":"2000","unstructured":"Ng, A. Y., & Jordan, M. I. (2000). Pegasus: a policy search method for large mdps and pomdps. In C. Boutilier & M. Goldszmidt (Eds.), Proceedings of the 16th conference in uncertainty in artificial intelligence (UAI-00), Stanford University, Stanford, California (pp. 406\u2013415). San Mateo: Morgan Kaufmann."},{"key":"5313_CR58","first-page":"663","volume-title":"Proceedings of the 17th international conference on machine learning (ICML-00)","author":"A. Y. Ng","year":"2000","unstructured":"Ng, A. Y., & Russell, S. (2000). Algorithms for inverse reinforcement learning. In P. Langley (Ed.), Proceedings of the 17th international conference on machine learning (ICML-00), Stanford University, Stanford, California (pp. 663\u2013670). San Mateo: Morgan Kaufmann."},{"issue":"1","key":"5313_CR59","doi-asserted-by":"crossref","first-page":"40","DOI":"10.1007\/s10618-011-0219-9","volume":"24","author":"S.-H. Park","year":"2012","unstructured":"Park, S.-H., & F\u00fcrnkranz, J. (2012). Efficient prediction algorithms for binary decomposition techniques. Data Mining and Knowledge Discovery, 24(1), 40\u201377.","journal-title":"Data Mining and Knowledge Discovery"},{"key":"5313_CR60","doi-asserted-by":"crossref","first-page":"745","DOI":"10.1145\/1273496.1273590","volume-title":"Proceedings of the 24th international conference on machine learning (ICML-07)","author":"J. Peters","year":"2007","unstructured":"Peters, J., & Schaal, S. (2007). Reinforcement learning by reward-weighted regression for operational space control. In Z. Ghahramani (Ed.), Proceedings of the 24th international conference on machine learning (ICML-07), Corvallis, Oregon, USA (pp.\u00a0745\u2013750)."},{"issue":"7\u20139","key":"5313_CR61","doi-asserted-by":"crossref","first-page":"1180","DOI":"10.1016\/j.neucom.2007.11.026","volume":"71","author":"J. Peters","year":"2008","unstructured":"Peters, J., & Schaal, S. (2008a) Natural actor-critic. Neurocomputing, 71(7\u20139), 1180\u20131190.","journal-title":"Neurocomputing"},{"issue":"4","key":"5313_CR62","doi-asserted-by":"crossref","first-page":"682","DOI":"10.1016\/j.neunet.2008.02.003","volume":"21","author":"J. Peters","year":"2008","unstructured":"Peters, J., & Schaal, S. (2008b). Reinforcement learning of motor skills with policy gradients. Neural Networks, 21(4), 682\u2013697.","journal-title":"Neural Networks"},{"key":"5313_CR63","first-page":"417","volume-title":"Proceedings of the 18th international conference on machine learning (ICML-01)","author":"D. Precup","year":"2001","unstructured":"Precup, D., Sutton, R. S., & Dasgupta, S. (2001). Off-policy temporal difference learning with function approximation. In C. E. Brodley & A. P. Danyluk (Eds.), Proceedings of the 18th international conference on machine learning (ICML-01), Williams College, Williamstown, MA (pp. 417\u2013424). San Mateo: Morgan Kaufmann."},{"key":"5313_CR64","volume-title":"Markov decision processes: discrete stochastic dynamic programming","author":"M. L. Puterman","year":"2005","unstructured":"Puterman, M. L. (2005). Markov decision processes: discrete stochastic dynamic programming (2nd edn.). New York: Wiley.","edition":"2"},{"key":"5313_CR65","first-page":"242","volume-title":"Proceedings of 20th international conference on automated planning and scheduling (ICAPS-10)","author":"R. Ramanujan","year":"2010","unstructured":"Ramanujan, R., Sabharwal, A., & Selman, B. (2010). On adversarial search spaces and sampling-based planning. In R. I. Brafman, H. Geffner, J. Hoffmann, & H. A. Kautz (Eds.), Proceedings of 20th international conference on automated planning and scheduling (ICAPS-10), Toronto, Ontario, Canada (pp. 242\u2013245)."},{"key":"5313_CR66","first-page":"263","volume-title":"Proc. 3rd European workshop on probabilistic graphical models","author":"A. Reyes","year":"2006","unstructured":"Reyes, A., Ibarguengoytia, P., Sucar, L., & Morales, E. (2006). Abstraction and refinement for solving continuous Markov decision processes. In Proc. 3rd European workshop on probabilistic graphical models, Prague, Czech Republic (pp. 263\u2013270)."},{"key":"5313_CR67","first-page":"567","volume-title":"Proc. UAI, 15th conference on uncertainty in artificial intelligence","author":"R. Sabbadin","year":"1999","unstructured":"Sabbadin, R. (1999). A possibilistic model for qualitative sequential decision problems under uncertainty in partially observable environments. In Proc. UAI, 15th conference on uncertainty in artificial intelligence, Stockholm, Sweden (pp. 567\u2013574)."},{"issue":"1","key":"5313_CR68","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1017\/S0269888900007669","volume":"11","author":"C. Sammut","year":"1996","unstructured":"Sammut, C. (1996). Automatic construction of reactive control systems using symbolic machine learning. Knowledge Engineering Review, 11(1), 27\u201342.","journal-title":"Knowledge Engineering Review"},{"issue":"3","key":"5313_CR69","doi-asserted-by":"crossref","first-page":"221","DOI":"10.1162\/evco.1994.2.3.221","volume":"2","author":"N. Srinivas","year":"1995","unstructured":"Srinivas, N., & Deb, K. (1995). Multiobjective optimization using nondominant sorting in genetic algorithms. Evolutionary Computation, 2(3), 221\u2013248.","journal-title":"Evolutionary Computation"},{"key":"5313_CR70","first-page":"9","volume":"3","author":"R. S. Sutton","year":"1988","unstructured":"Sutton, R. S. (1988). Learning to predict by the methods of temporal differences. Machine Learning, 3, 9\u201344.","journal-title":"Machine Learning"},{"key":"5313_CR71","first-page":"1057","volume-title":"Advances in neural information processing systems 12 (NIPS-99)","author":"R. S. Sutton","year":"2000","unstructured":"Sutton, R. S., McAllester, D. A., Singh, S. P., & Mansour, Y. (2000). Policy gradient methods for reinforcement learning with function approximation. In S. A. Solla, T. K. Leen, & K.-R. M\u00fcller (Eds.), Advances in neural information processing systems 12 (NIPS-99), Denver, Colorado, USA (pp. 1057\u20131063). Cambridge: MIT Press."},{"key":"5313_CR72","first-page":"1","volume-title":"Proceedings of the ICML\u201904 workshop on relational reinforcement learning","author":"P. Tadepalli","year":"2004","unstructured":"Tadepalli, P., Givan, R., & Driessens, K. (2004). Relational reinforcement learning: an overview. In P. Tadepalli, R. Givan, & K. Driessens (Eds.), Proceedings of the ICML\u201904 workshop on relational reinforcement learning (pp. 1\u20139)."},{"key":"5313_CR73","first-page":"1633","volume":"10","author":"M. E. Taylor","year":"2009","unstructured":"Taylor, M. E., & Stone, P. (2009). Transfer learning for reinforcement learning domains: a\u00a0survey. Journal of Machine Learning Research, 10, 1633\u20131685.","journal-title":"Journal of Machine Learning Research"},{"key":"5313_CR74","first-page":"257","volume":"8","author":"G. Tesauro","year":"1992","unstructured":"Tesauro, G. (1992). Practical issues in temporal difference learning. Machine Learning, 8, 257\u2013278.","journal-title":"Machine Learning"},{"issue":"1\u20132","key":"5313_CR75","doi-asserted-by":"crossref","first-page":"181","DOI":"10.1016\/S0004-3702(01)00110-2","volume":"134","author":"G. Tesauro","year":"2002","unstructured":"Tesauro, G. (2002). Programming backgammon using self-teaching neural nets. Artificial Intelligence, 134(1\u20132), 181\u2013199. Special Issue on Games, Computers and Artificial Intelligence.","journal-title":"Artificial Intelligence"},{"key":"5313_CR76","first-page":"3137","volume":"11","author":"E. Theodorou","year":"2010","unstructured":"Theodorou, E., Buchli, J., & Schaal, S. (2010). A generalized path integral control approach to reinforcement learning. Journal of Machine Learning Research, 11, 3137\u20133181.","journal-title":"Journal of Machine Learning Research"},{"key":"5313_CR77","first-page":"412","volume-title":"Proceedings of the 16th European conference on machine learning (ECML-05)","author":"L. Torrey","year":"2005","unstructured":"Torrey, L., Walker, T., Shavlik, J. W., & Maclin, R. (2005). Using advice to transfer knowledge acquired in one reinforcement learning task to another. In J. Gama, R. Camacho, P. Brazdil, A. Jorge, & L. Torgo (Eds.), Proceedings of the 16th European conference on machine learning (ECML-05), Porto, Portugal (pp. 412\u2013424). Berlin: Springer."},{"issue":"1\u20132","key":"5313_CR78","first-page":"51","volume":"84","author":"P. Vamplew","year":"2010","unstructured":"Vamplew, P., Dazeley, R., Berry, A., Issabekov, R., & Dekker, E. (2010). Empirical evaluation methods for multiobjective reinforcement learning algorithms. Machine Learning, 84(1\u20132), 51\u201380.","journal-title":"Machine Learning"},{"key":"5313_CR79","first-page":"45","volume-title":"Preference leaning","author":"S. Vembu","year":"2011","unstructured":"Vembu, S., & G\u00e4rtner, T. (2011). Label ranking algorithms: a survey. In J. F\u00fcrnkranz & E. H\u00fcllermeier (Eds.), Preference leaning (pp. 45\u201364). Berlin: Springer."},{"key":"5313_CR80","first-page":"279","volume":"8","author":"C. J. Watkins","year":"1992","unstructured":"Watkins, C. J., & Dayan, P. (1992) Q-learning. Machine Learning, 8, 279\u2013292.","journal-title":"Machine Learning"},{"key":"5313_CR81","first-page":"229","volume":"8","author":"R. J. Williams","year":"1992","unstructured":"Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8, 229\u2013256.","journal-title":"Machine Learning"},{"key":"5313_CR82","volume-title":"Proc. 22nd international workshop on qualitative reasoning","author":"J. Zabkar","year":"2008","unstructured":"Zabkar, J., Bratko, I., & Mohan, A. (2008). Learning qualitative models by an autonomous robot. In Proc. 22nd international workshop on qualitative reasoning, Boulder, Colorado."},{"key":"5313_CR83","first-page":"3295","volume":"28","author":"Y. Zhao","year":"2009","unstructured":"Zhao, Y., Kosorok, M., & Zeng, D. (2009). Reinforcement learning design for cancer clinical trials. Statistics in Medicine, 28, 3295\u20133315.","journal-title":"Statistics in Medicine"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-012-5313-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-012-5313-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-012-5313-8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,28]],"date-time":"2024-04-28T08:56:31Z","timestamp":1714294591000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-012-5313-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,8,10]]},"references-count":83,"journal-issue":{"issue":"1-2","published-print":{"date-parts":[[2012,10]]}},"alternative-id":["5313"],"URL":"https:\/\/doi.org\/10.1007\/s10994-012-5313-8","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012,8,10]]}}}