{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T13:21:38Z","timestamp":1762521698248},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2014,7,2]],"date-time":"2014-07-02T00:00:00Z","timestamp":1404259200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2014,12]]},"DOI":"10.1007\/s10994-014-5458-8","type":"journal-article","created":{"date-parts":[[2014,7,2]],"date-time":"2014-07-02T03:34:55Z","timestamp":1404272095000},"page":"327-351","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Preference-based reinforcement learning: evolutionary direct policy search using a preference-based racing algorithm"],"prefix":"10.1007","volume":"97","author":[{"given":"R\u00f3bert","family":"Busa-Fekete","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bal\u00e1zs","family":"Sz\u00f6r\u00e9nyi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Paul","family":"Weng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weiwei","family":"Cheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Eyke","family":"H\u00fcllermeier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2014,7,2]]},"reference":[{"key":"5458_CR1","doi-asserted-by":"crossref","unstructured":"Abbeel, P., & Ng, A. (2004). Apprenticeship learning via inverse reinforcement learning. In Proceedings of the 21th international conference on machine learning. New York, NY: ACM.","DOI":"10.1145\/1015330.1015430"},{"key":"5458_CR2","doi-asserted-by":"crossref","unstructured":"Akrour, R., Schoenauer, M., & Sebag, M. (2011). Preference-based policy learning. In Proceedings ECMLPKDD 2011, European conference on machine learning and principles and practice of knowledge discovery in databases (pp. 12\u201327). Berlin: Springer.","DOI":"10.1007\/978-3-642-23780-5_11"},{"key":"5458_CR3","doi-asserted-by":"crossref","unstructured":"Akrour, R., Schoenauer, M., & Sebag, M. (2012). April: Active preference-learning based reinforcement learning. In Proceedings ECMLPKDD 2012, European conference on machine learning and principles and practice of knowledge discovery in databases (pp. 116\u2013131). Berlin: Springer.","DOI":"10.1007\/978-3-642-33486-3_8"},{"key":"5458_CR4","unstructured":"Akrour, R., Schoenauer, M., & Sebag, M. (2013). Interactive robot education. In ECML workshop on reinforcement learning with generalized feedback: Beyond numeric rewards."},{"issue":"3","key":"5458_CR5","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1016\/0167-7152(94)00072-G","volume":"22","author":"MA Arcones","year":"1995","unstructured":"Arcones, M. A. (1995). A Bernstein-type inequality for u-statistics and u-processes. Statistics & Probability Letters, 22(3), 239\u2013247.","journal-title":"Statistics & Probability Letters"},{"key":"5458_CR6","doi-asserted-by":"crossref","unstructured":"Audibert, J., Munos, R., & Szepesv\u00e1ri, C. (2007). Tuning bandit algorithms in stochastic environments. In Proceedings of the algorithmic learning theory (pp. 150\u2013165).","DOI":"10.1007\/978-3-540-75225-7_15"},{"key":"5458_CR7","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1023\/A:1013689704352","volume":"47","author":"P Auer","year":"2002","unstructured":"Auer, P., Cesa-Bianchi, N., & Fischer, P. (2002). Finite-time analysis of the multiarmed bandit problem. Machine Learning, 47, 235\u2013256.","journal-title":"Machine Learning"},{"key":"5458_CR8","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1023\/A:1015059928466","volume":"1","author":"H Beyer","year":"2002","unstructured":"Beyer, H., & Schwefel, H. (2002). Evolution strategies-a comprehensive introduction. Natural computing, 1, 3\u201352.","journal-title":"Natural computing"},{"key":"5458_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, W., F\u00fcrnkranz, J., H\u00fcllermeier, E., & Park, S. (2011). Preference-based policy iteration: Leveraging preference learning for reinforcement learning. In Proceedings ECMLPKDD 2011, European conference on machine learning and principles and practice of knowledge discovery in databases (pp. 414\u2013429). Berlin: Springer.","DOI":"10.1007\/978-3-642-23780-5_30"},{"key":"5458_CR10","volume-title":"Evolutionary algorithms for solving multi-objective problems","author":"C Coello","year":"2007","unstructured":"Coello, C., Lamont, G., & Van Veldhuizen, D. (2007). Evolutionary algorithms for solving multi-objective problems. Berlin: Springer."},{"key":"5458_CR11","doi-asserted-by":"crossref","unstructured":"Even-Dar, E., Mannor, S., & Mansour, Y. (2002). PAC bounds for multi-armed bandit and markov decision processes. In Proceedings of the 15th annual conference on computational learning theory (pp. 255\u2013270). Berlin: Springer.","DOI":"10.1007\/3-540-45435-7_18"},{"key":"5458_CR12","doi-asserted-by":"crossref","first-page":"31","DOI":"10.1016\/0022-2496(82)90034-7","volume":"26","author":"P Fishburn","year":"1982","unstructured":"Fishburn, P. (1982). Nontransitive measurable utility. Journal of Mathematical Psychology, 26, 31\u201367.","journal-title":"Journal of Mathematical Psychology"},{"key":"5458_CR13","volume-title":"Game theory","author":"D Fudenberg","year":"1991","unstructured":"Fudenberg, D., & Tirole, J. (1991). Game theory. Cambridge, MA: MIT."},{"key":"5458_CR14","volume-title":"Preference learning","year":"2011","unstructured":"F\u00fcrnkranz, J., & H\u00fcllermeier, E. (Eds.). (2011). Preference learning. Berlin: Springer."},{"issue":"1\u20132","key":"5458_CR15","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1007\/s10994-012-5313-8","volume":"89","author":"J F\u00fcrnkranz","year":"2012","unstructured":"F\u00fcrnkranz, J., H\u00fcllermeier, E., Cheng, W., & Park, S. (2012). Preference-based reinforcement learning: A formal framework and a policy iteration algorithm. Machine Learning, 89(1\u20132), 123\u2013156.","journal-title":"Machine Learning"},{"key":"5458_CR16","doi-asserted-by":"crossref","unstructured":"Hansen, N., & Kern, S. (2004). Evaluating the CMA evolution strategy on multimodal test functions. In Parallel problem solving from nature-PPSN VIII (pp. 282\u2013291). Berlin: Springer.","DOI":"10.1007\/978-3-540-30217-9_29"},{"key":"5458_CR17","unstructured":"Heidrich-Meisner, V., & Igel, C. (2008). Variable metric reinforcement learning methods applied to the noisy mountain car problem. Recent advances in reinforcement learning (pp. 136\u2013150). Berlin: Springer."},{"key":"5458_CR18","doi-asserted-by":"crossref","unstructured":"Heidrich-Meisner, V., & Igel, C. (2009). Hoeffding and Bernstein races for selecting policies in evolutionary direct policy search. In Proceedings of the 26th international conference on machine learning (pp. 401\u2013408). New York, NY: ACM.","DOI":"10.1145\/1553374.1553426"},{"issue":"4","key":"5458_CR19","doi-asserted-by":"crossref","first-page":"152","DOI":"10.1016\/j.jalgor.2009.04.002","volume":"64","author":"V Heidrich-Meisner","year":"2009","unstructured":"Heidrich-Meisner, V., & Igel, C. (2009). Neuroevolution strategies for episodic reinforcement learning. Journal of Algorithms, 64(4), 152\u2013168.","journal-title":"Journal of Algorithms"},{"issue":"1","key":"5458_CR20","doi-asserted-by":"crossref","first-page":"133","DOI":"10.1214\/aoms\/1177729491","volume":"23","author":"J Hemelrijk","year":"1952","unstructured":"Hemelrijk, J. (1952). Note on Wilcoxon\u2019s two-sample test when ties are present. The Annals of Mathematical Statistics, 23(1), 133\u2013135.","journal-title":"The Annals of Mathematical Statistics"},{"key":"5458_CR21","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1080\/01621459.1963.10500830","volume":"58","author":"W Hoeffding","year":"1963","unstructured":"Hoeffding, W. (1963). Probability inequalities for sums of bounded random variables. Journal of the American Statistical Association, 58, 13\u201330.","journal-title":"Journal of the American Statistical Association"},{"key":"5458_CR22","unstructured":"Kalyanakrishnan, S., Tewari, A., Auer, P., & Stone, P. (2012). PAC subset selection in stochastic multi-armed bandits. In Proceedings of the Twenty-ninth International Conference on Machine Learning (ICML 2012) (pp. 655\u2013662). Omnipress."},{"key":"5458_CR23","unstructured":"Kreweras, G. (1961). Sur une possibilit\u00e9 de rationaliser les intransitivit\u00e9s. In La d\u00e9cision, CNRS."},{"key":"5458_CR24","unstructured":"Lagoudakis, M., & Parr, R. (2003). Reinforcement learning as classification: Leveraging modern classifiers. In Proceedings of the 20th international conference on machine learning (pp. 424\u2013431). AAAI Press."},{"key":"5458_CR25","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-642-60805-6","volume-title":"Tournament solutions and majority voting","author":"J Laslier","year":"1997","unstructured":"Laslier, J. (1997). Tournament solutions and majority voting. Berlin: Springer."},{"key":"5458_CR26","unstructured":"Lazaric, A., Ghavamzadeh, M., & Munos, R. (2010). Analysis of a classification-based policy iteration algorithm. In Proceedings of the 27th international conference on machine learning (pp. 607\u2013614). Omnipress."},{"key":"5458_CR27","unstructured":"Maron, O., & Moore, A. (1994). Hoeffding races: accelerating model selection search for classification and function approximation. In Advances in neural information processing systems (pp. 59\u201366). Morgan Kaufmann."},{"issue":"1","key":"5458_CR28","doi-asserted-by":"crossref","first-page":"193","DOI":"10.1023\/A:1006556606079","volume":"5","author":"O Maron","year":"1997","unstructured":"Maron, O., & Moore, A. (1997). The racing algorithm: Model selection for lazy learners. Artificial Intelligence Review, 5(1), 193\u2013225.","journal-title":"Artificial Intelligence Review"},{"key":"5458_CR29","doi-asserted-by":"crossref","unstructured":"Mnih, V., Szepesv\u00e1ri, C., & Audibert, J. (2008). Empirical Bernstein stopping. In Proceedings of the 25th international conference on Machine learning (pp. 672\u2013679). New York, NY: ACM.","DOI":"10.1145\/1390156.1390241"},{"key":"5458_CR30","doi-asserted-by":"crossref","DOI":"10.1017\/CCOL0521360552","volume-title":"Axioms of cooperative decision making","author":"H Moulin","year":"1988","unstructured":"Moulin, H. (1988). Axioms of cooperative decision making. Cambridge: Cambridge University Press."},{"issue":"4","key":"5458_CR31","doi-asserted-by":"crossref","first-page":"369","DOI":"10.1162\/evco.1994.2.4.369","volume":"2","author":"A Ostermeier","year":"1994","unstructured":"Ostermeier, A., Gawelczyk, A., & Hansen, N. (1994). A derandomized approach to self adaptation of evolution strategies. Evolutionary Computation, 2(4), 369\u2013380.","journal-title":"Evolutionary Computation"},{"key":"5458_CR32","first-page":"1903","volume":"23","author":"T Peel","year":"2010","unstructured":"Peel, T., Anthoine, S., & Ralaivola, L. (2010). Empirical Bernstein inequalities for u-statistics. Advances in Neural Information Processing Systems, 23, 1903\u20131911.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"1","key":"5458_CR33","doi-asserted-by":"crossref","first-page":"188","DOI":"10.1016\/j.orl.2011.03.007","volume":"39","author":"A Pr\u00e9kopa","year":"2011","unstructured":"Pr\u00e9kopa, A., Yoda, K., & Subasi, M. (2011). Uniform quasi-concavity in probabilistic constrained stochastic programming. Operations Research Letters, 39(1), 188\u2013192.","journal-title":"Operations Research Letters"},{"key":"5458_CR34","doi-asserted-by":"crossref","DOI":"10.1002\/9780470316887","volume-title":"Markov decision processes: Discrete stochastic dynamic programming","author":"M Puterman","year":"1994","unstructured":"Puterman, M. (1994). Markov decision processes: Discrete stochastic dynamic programming. New York: Wiley."},{"key":"5458_CR35","unstructured":"Rummery, G. A., & Niranjan, M. (1994). On-line Q-learning using connectionist systems. Tech. Rep. CUED\/F-INFENG\/TR 166, Cambridge University, Engineering Department."},{"key":"5458_CR36","doi-asserted-by":"crossref","unstructured":"Serfling, R. (1980). Approximation theorems of mathematical statistics (Vol. 34). Wiley Online Library.","DOI":"10.1002\/9780470316481"},{"key":"5458_CR37","doi-asserted-by":"crossref","unstructured":"Szepesv\u00e1ri, C. (2010). Algorithms for reinforcement learning. Morgan and Claypool.","DOI":"10.2200\/S00268ED1V01Y201005AIM009"},{"key":"5458_CR38","unstructured":"Weng, P., Busa-Fekete, R., & H\u00fcllermeier, E. (2013). Interactive q-learning with ordinal rewards and unreliable tutor. In ECML workshop on reinforcement learning with generalized feedback: Beyond numeric rewards."},{"issue":"3","key":"5458_CR39","first-page":"229","volume":"8","author":"R Williams","year":"1992","unstructured":"Williams, R. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8(3), 229\u2013256.","journal-title":"Machine Learning"},{"key":"5458_CR40","first-page":"1142","volume":"25","author":"A Wilson","year":"2012","unstructured":"Wilson, A., Fern, A., & Tadepalli, P. (2012). A bayesian approach for policy learning from trajectory preference queries. Advances in Neural Information Processing Systems, 25, 1142\u20131150.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"5","key":"5458_CR41","doi-asserted-by":"crossref","first-page":"1538","DOI":"10.1016\/j.jcss.2011.12.028","volume":"78","author":"Y Yue","year":"2012","unstructured":"Yue, Y., Broder, J., Kleinberg, R., & Joachims, T. (2012). The k-armed dueling bandits problem. Journal of Computer and System Sciences, 78(5), 1538\u20131556.","journal-title":"Journal of Computer and System Sciences"},{"issue":"26","key":"5458_CR42","doi-asserted-by":"crossref","first-page":"3294","DOI":"10.1002\/sim.3720","volume":"28","author":"Y Zhao","year":"2009","unstructured":"Zhao, Y., Kosorok, M., & Zeng, D. (2009). Reinforcement learning design for cancer clinical trials. Statistics in Medicine, 28(26), 3294\u20133315.","journal-title":"Statistics in Medicine"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-014-5458-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-014-5458-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-014-5458-8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,8,12]],"date-time":"2019-08-12T05:36:49Z","timestamp":1565588209000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-014-5458-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,7,2]]},"references-count":42,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2014,12]]}},"alternative-id":["5458"],"URL":"https:\/\/doi.org\/10.1007\/s10994-014-5458-8","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,7,2]]}}}