{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T13:14:13Z","timestamp":1762521253108},"publisher-location":"Berlin, Heidelberg","reference-count":19,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540897217"},{"type":"electronic","value":"9783540897224"}],"license":[{"start":{"date-parts":[[2008,1,1]],"date-time":"2008-01-01T00:00:00Z","timestamp":1199145600000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2008]]},"DOI":"10.1007\/978-3-540-89722-4_21","type":"book-chapter","created":{"date-parts":[[2008,11,26]],"date-time":"2008-11-26T13:57:14Z","timestamp":1227707834000},"page":"268-281","source":"Crossref","is-referenced-by-count":1,"title":["Markov Decision Processes with Arbitrary Reward Processes"],"prefix":"10.1007","author":[{"given":"Jia Yuan","family":"Yu","sequence":"first","affiliation":[]},{"given":"Shie","family":"Mannor","sequence":"additional","affiliation":[]},{"given":"Nahum","family":"Shimkin","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"issue":"1","key":"21_CR1","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1137\/S0097539701398375","volume":"32","author":"P. Auer","year":"2002","unstructured":"Auer, P., Cesa-Bianchi, N., Freund, Y., Schapire, R.E.: The nonstochastic multiarmed bandit problem. SIAM J. Computing\u00a032(1), 48\u201377 (2002)","journal-title":"SIAM J. Computing"},{"key":"21_CR2","doi-asserted-by":"publisher","first-page":"39","DOI":"10.2307\/1913732","volume":"32","author":"R.J. Aumann","year":"1964","unstructured":"Aumann, R.J.: Markets with a continuum of traders. Econometrica\u00a032, 39\u201350 (1964)","journal-title":"Econometrica"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Bertsekas, D.P.: Dynamic programming and optimal control, 2nd edn., vol.\u00a02. Athena Scientific (2001)","DOI":"10.1007\/0-306-48332-7_333"},{"key":"21_CR4","unstructured":"Bertsekas, D.P., Tsitsiklis, J.N.: Neuro-dynamic programming. Athena Scientific (1996)"},{"issue":"2","key":"21_CR5","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s10959-006-0016-3","volume":"19","author":"S.G. Bobkov","year":"2006","unstructured":"Bobkov, S.G., Tetali, P.: Modified logarithmic Sobolev inequalities in discrete settings. Journal of Theoretical Probability\u00a019(2), 289\u2013336 (2006)","journal-title":"Journal of Theoretical Probability"},{"issue":"2","key":"21_CR6","doi-asserted-by":"publisher","first-page":"447","DOI":"10.1137\/S0363012997331639","volume":"38","author":"V.S. Borkar","year":"2000","unstructured":"Borkar, V.S., Meyn, S.P.: The O.D.E. method for convergence of stochastic approximation and reinforcement learning. SIAM J. Control and Optimization\u00a038(2), 447\u2013469 (2000)","journal-title":"SIAM J. Control and Optimization"},{"key":"21_CR7","first-page":"213","volume":"3","author":"R.I. Brafman","year":"2003","unstructured":"Brafman, R.I., Tennenholtz, M.: R-max\u2014a general polynomial time algorithm for near-optimal reinforcement learning. Journal of Machine Learning Research\u00a03, 213\u2013231 (2003)","journal-title":"Journal of Machine Learning Research"},{"key":"21_CR8","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511546921","volume-title":"Prediction, learning, and games","author":"N. Cesa-Bianchi","year":"2006","unstructured":"Cesa-Bianchi, N., Lugosi, G.: Prediction, learning, and games. Cambridge University Press, Cambridge (2006)"},{"key":"21_CR9","unstructured":"Crites, R.H., Barto, A.G.: An actor\/critic algorithm that is equivalent to Q-learning. In: Advances in Neural Information Processing Systems, pp. 401\u2013408 (1995)"},{"issue":"3\u20134","key":"21_CR10","doi-asserted-by":"publisher","first-page":"271","DOI":"10.1023\/A:1016654625257","volume":"16","author":"N.G. Duffield","year":"2001","unstructured":"Duffield, N.G., Massey, W.A., Whitt, W.: A nonstationary offered-load model for packet networks. Telecommunication Systems\u00a016(3\u20134), 271\u2013296 (2001)","journal-title":"Telecommunication Systems"},{"key":"21_CR11","unstructured":"Even-Dar, E., Kakade, S., Mansour, Y.: Experts in a Markov decision process. In: NIPS, pp. 401\u2013408 (2004)"},{"issue":"3","key":"21_CR12","doi-asserted-by":"publisher","first-page":"320","DOI":"10.1006\/game.1993.1021","volume":"5","author":"D. Fudenberg","year":"1993","unstructured":"Fudenberg, D., Kreps, D.M.: Learning mixed equilibria. Games and Economic Behavior\u00a05(3), 320\u2013367 (1993)","journal-title":"Games and Economic Behavior"},{"key":"21_CR13","first-page":"97","volume-title":"Contributions to the Theory of Games","author":"J. Hannan","year":"1957","unstructured":"Hannan, J.: Approximation to Bayes risk in repeated play. In: Contributions to the Theory of Games, vol.\u00a03, pp. 97\u2013139. Princeton University Press, Princeton (1957)"},{"issue":"2","key":"21_CR14","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1023\/A:1007424614876","volume":"32","author":"M. Herbster","year":"1998","unstructured":"Herbster, M., Warmuth, M.K.: Tracking the best expert. Machine Learning\u00a032(2), 151\u2013178 (1998)","journal-title":"Machine Learning"},{"issue":"3","key":"21_CR15","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.jcss.2004.10.016","volume":"71","author":"A. Kalai","year":"2005","unstructured":"Kalai, A., Vempala, S.: Efficient algorithms for online decision problems. Journal of Computer and System Sciences\u00a071(3), 291\u2013307 (2005)","journal-title":"Journal of Computer and System Sciences"},{"issue":"2","key":"21_CR16","doi-asserted-by":"publisher","first-page":"327","DOI":"10.1287\/moor.28.2.327.14483","volume":"28","author":"S. Mannor","year":"2003","unstructured":"Mannor, S., Shimkin, N.: The empirical Bayes envelope and regret minimization in competitive Markov decision processes. Mathematics of Operations Research\u00a028(2), 327\u2013345 (2003)","journal-title":"Mathematics of Operations Research"},{"issue":"7","key":"21_CR17","doi-asserted-by":"publisher","first-page":"1947","DOI":"10.1109\/TIT.2002.1013135","volume":"48","author":"N. Merhav","year":"2002","unstructured":"Merhav, N., Ordentlich, E., Seroussi, G., Weinberger, M.J.: On sequential strategies for loss functions with memory. IEEE Trans. Inf. Theory\u00a048(7), 1947\u20131958 (2002)","journal-title":"IEEE Trans. Inf. Theory"},{"issue":"10","key":"21_CR18","doi-asserted-by":"publisher","first-page":"1095","DOI":"10.1073\/pnas.39.10.1953","volume":"39","author":"L. Shapley","year":"1953","unstructured":"Shapley, L.: Stochastic games. PNAS\u00a039(10), 1095\u20131100 (1953)","journal-title":"PNAS"},{"key":"21_CR19","unstructured":"Yu, J.Y., Mannor, S., Shimkin, N.: Markov decision processes with arbitrarily varying rewards (Preprint, 2008), http:\/\/www.cim.mcgill.ca\/~jiayuan\/mdp.pdf"}],"container-title":["Lecture Notes in Computer Science","Recent Advances in Reinforcement Learning"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-540-89722-4_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,5,21]],"date-time":"2023-05-21T14:00:52Z","timestamp":1684677652000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-540-89722-4_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2008]]},"ISBN":["9783540897217","9783540897224"],"references-count":19,"URL":"https:\/\/doi.org\/10.1007\/978-3-540-89722-4_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2008]]}}}