{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,15]],"date-time":"2025-12-15T19:30:26Z","timestamp":1765827026290},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2012,4,18]],"date-time":"2012-04-18T00:00:00Z","timestamp":1334707200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Ann Oper Res"],"published-print":{"date-parts":[[2013,9]]},"DOI":"10.1007\/s10479-012-1128-z","type":"journal-article","created":{"date-parts":[[2012,4,17]],"date-time":"2012-04-17T07:52:10Z","timestamp":1334649130000},"page":"95-132","source":"Crossref","is-referenced-by-count":25,"title":["Q-learning and policy iteration algorithms for stochastic shortest path problems"],"prefix":"10.1007","volume":"208","author":[{"given":"Huizhen","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dimitri P.","family":"Bertsekas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2012,4,18]]},"reference":[{"key":"1128_CR1","doi-asserted-by":"crossref","first-page":"226","DOI":"10.1145\/322063.322067","volume":"25","author":"G. M. Baudet","year":"1978","unstructured":"Baudet, G. M. (1978). Asynchronous iterative methods for multiprocessors. Journal of the Association for Computing Machinery, 25, 226\u2013244.","journal-title":"Journal of the Association for Computing Machinery"},{"key":"1128_CR2","doi-asserted-by":"crossref","first-page":"610","DOI":"10.1109\/TAC.1982.1102980","volume":"27","author":"D. P. Bertsekas","year":"1982","unstructured":"Bertsekas, D. P. (1982). Distributed dynamic programming. IEEE Transactions on Automatic Control, 27, 610\u2013616.","journal-title":"IEEE Transactions on Automatic Control"},{"key":"1128_CR3","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1007\/BF02591967","volume":"27","author":"D. P. Bertsekas","year":"1983","unstructured":"Bertsekas, D. P. (1983). Asynchronous distributed computation of fixed points. Mathematical Programming, 27, 107\u2013120.","journal-title":"Mathematical Programming"},{"key":"1128_CR4","volume-title":"Dynamic programming and optimal control","author":"D. P. Bertsekas","year":"2007","unstructured":"Bertsekas, D. P. (2007). Dynamic programming and optimal control (Vol.\u00a0II, 3rd ed.). Belmont: Athena Scientific.","edition":"3"},{"key":"1128_CR5","unstructured":"Bertsekas, D. P. (2011). Approximate dynamic programming. Book chapter. On-line at: http:\/\/web.mit.edu\/dimitrib\/www\/dpchapter.html ."},{"key":"1128_CR6","volume-title":"Parallel and distributed computation: numerical methods","author":"D. P. Bertsekas","year":"1989","unstructured":"Bertsekas, D. P., & Tsitsiklis, J. N. (1989). Parallel and distributed computation: numerical methods. Englewood Cliffs: Prentice-Hall. Republished by Athena Scientific, Belmont, 1997."},{"issue":"3","key":"1128_CR7","doi-asserted-by":"crossref","first-page":"580","DOI":"10.1287\/moor.16.3.580","volume":"16","author":"D. P. Bertsekas","year":"1991","unstructured":"Bertsekas, D. P., & Tsitsiklis, J. N. (1991). An analysis of stochastic shortest path problems. Mathematics of Operations Research, 16(3), 580\u2013595.","journal-title":"Mathematics of Operations Research"},{"key":"1128_CR8","volume-title":"Neuro-dynamic programming","author":"D. P. Bertsekas","year":"1996","unstructured":"Bertsekas, D. P., & Tsitsiklis, J. N. (1996). Neuro-dynamic programming. Belmont: Athena Scientific."},{"key":"1128_CR9","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1016\/j.cam.2008.07.037","volume":"227","author":"D. P. Bertsekas","year":"2009","unstructured":"Bertsekas, D. P., & Yu, H. (2009). Projected equation methods for approximate solution of large linear systems. Journal of Computational and Applied Mathematics, 227, 27\u201350.","journal-title":"Journal of Computational and Applied Mathematics"},{"key":"1128_CR10","first-page":"1368","volume-title":"Proc. of the 48th Allerton conference on communication, control and computing","author":"D. P. Bertsekas","year":"2010","unstructured":"Bertsekas, D. P., & Yu, H. (2010). Distributed asynchronous policy iteration in dynamic programming. In Proc. of the 48th Allerton conference on communication, control and computing, Allerton, IL (pp.\u00a01368\u20131375)."},{"issue":"1","key":"1128_CR11","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1287\/moor.1110.0532","volume":"37","author":"D. P. Bertsekas","year":"2012","unstructured":"Bertsekas, D. P., & Yu, H. (2012). Q-learning and enhanced policy iteration in discounted dynamic programming. Mathematics of Operations Research, 37(1), 66\u201394. (Technical Report 2831). LIDS, MIT, April 2010.","journal-title":"Mathematics of Operations Research"},{"key":"1128_CR12","author":"P. G. Canbolat","year":"2012","unstructured":"Canbolat, P. G., & Rothblum, U. G. (2012). (Approximate) iterated successive approximations algorithm for sequential decision processes. Annals of Operations Research. doi: 10.1007\/s10479-012-1073-x .","journal-title":"Annals of Operations Research"},{"key":"1128_CR13","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1007\/s10626-006-8134-8","volume":"16","author":"D. S. Choi","year":"2006","unstructured":"Choi, D. S., & Van Roy, B. (2006). A generalized Kalman filter for fixed point approximation and efficient temporal-difference learning. Discrete Event Dynamic Systems, 16, 207\u2013239.","journal-title":"Discrete Event Dynamic Systems"},{"key":"1128_CR14","volume-title":"Finite state Markovian decision processes","author":"C. Derman","year":"1970","unstructured":"Derman, C. Finite state Markovian decision processes. New York: Academic Press (1970)."},{"key":"1128_CR15","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1115\/1.3657260","volume":"84","author":"J. H. Eaton","year":"1962","unstructured":"Eaton, J. H., & Zadeh, L. A. (1962). Optimal pursuit strategies in discrete state probabilistic systems. Transactions of the ASME. Series D. Journal of Basic Engineering, 84, 23\u201329.","journal-title":"Transactions of the ASME. Series D. Journal of Basic Engineering"},{"key":"1128_CR16","doi-asserted-by":"crossref","first-page":"392","DOI":"10.1287\/moor.17.2.392","volume":"17","author":"E. A. Feinberg","year":"1992","unstructured":"Feinberg, E. A. (1992). Stationary strategies in Borel dynamic programming. Mathematics of Operations Research, 17, 392\u2013397.","journal-title":"Mathematics of Operations Research"},{"key":"1128_CR17","first-page":"261","volume-title":"Proc. of the 12th int. conf. on machine learning","author":"G. J. Gordon","year":"1995","unstructured":"Gordon, G. J. (1995). Stable function approximation in dynamic programming. In Proc. of the 12th int. conf. on machine learning, San Francisco, CA (pp.\u00a0261\u2013268)."},{"key":"1128_CR18","doi-asserted-by":"crossref","first-page":"1185","DOI":"10.1162\/neco.1994.6.6.1185","volume":"6","author":"T. S. Jaakkola","year":"1994","unstructured":"Jaakkola, T. S., Jordan, M. I., & Singh, S. P. (1994). On the convergence of stochastic iterative dynamic programming algorithms. Neural Computation, 6, 1185\u20131201.","journal-title":"Neural Computation"},{"key":"1128_CR19","first-page":"345","volume":"7","author":"T. S. Jaakkola","year":"1995","unstructured":"Jaakkola, T. S., Singh, S. P., & Jordan, M. I. (1995). Reinforcement learning algorithm for partially observable Markov decision problems. Advances in Neural Information Processing Systems, 7, 345\u2013352.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"1128_CR20","doi-asserted-by":"crossref","DOI":"10.1002\/9780470316887","volume-title":"Markov decision processes: discrete stochastic dynamic programming","author":"M. L. Puterman","year":"1994","unstructured":"Puterman, M. L. (1994). Markov decision processes: discrete stochastic dynamic programming. New York: Wiley."},{"key":"1128_CR21","volume-title":"Stochastic control and optimization","author":"U. G. Rothblum","year":"1979","unstructured":"Rothblum, U. G. (1979). Iterated successive approximation for sequential decision processes. In J.\u00a0W.\u00a0B. van\u00a0Overhagen & H.\u00a0C. Tijms (Eds.), Stochastic control and optimization. Amsterdam: Vrije University."},{"key":"1128_CR22","volume-title":"Reinforcement learning","author":"R. S. Sutton","year":"1998","unstructured":"Sutton, R. S., & Barto, A. G. (1998). Reinforcement learning. Cambridge: MIT Press."},{"key":"1128_CR23","first-page":"1071","volume-title":"Proc. of the 27th int. conf. machine learning","author":"C. Thiery","year":"2010","unstructured":"Thiery, C., & Scherrer, B. (2010a). Least-squares \u03bb policy iteration: bias-variance trade-off in control problems. In Proc. of the 27th int. conf. machine learning, Haifa, Israel (pp.\u00a01071\u20131078)."},{"key":"1128_CR24","unstructured":"Thiery, C., & Scherrer, B. (2010b). Performance bound for approximate optimistic policy iteration (Technical report). INRIA."},{"key":"1128_CR25","first-page":"185","volume":"16","author":"J. N. Tsitsiklis","year":"1994","unstructured":"Tsitsiklis, J. N. (1994). Asynchronous stochastic approximation and Q-learning. Machine Learning, 16, 185\u2013202.","journal-title":"Machine Learning"},{"key":"1128_CR26","first-page":"59","volume":"22","author":"J. N. Tsitsiklis","year":"1996","unstructured":"Tsitsiklis, J. N., & Van\u00a0Roy, B. (1996). Feature-based methods for large-scale dynamic programming. Machine Learning, 22, 59\u201394.","journal-title":"Machine Learning"},{"key":"1128_CR27","doi-asserted-by":"crossref","first-page":"1840","DOI":"10.1109\/9.793723","volume":"44","author":"J. N. Tsitsiklis","year":"1999","unstructured":"Tsitsiklis, J. N., & Van\u00a0Roy, B. (1999). Optimal stopping of Markov processes: Hilbert space theory, approximation algorithms, and an application to pricing financial derivatives. IEEE Transactions on Automatic Control, 44, 1840\u20131851.","journal-title":"IEEE Transactions on Automatic Control"},{"key":"1128_CR28","doi-asserted-by":"crossref","first-page":"1635","DOI":"10.1214\/aoms\/1177697379","volume":"40","author":"A. F. Veinott Jr.","year":"1969","unstructured":"Veinott, A. F. Jr. (1969). Discrete dynamic programming with sensitive discount optimality criteria. Annals of Mathematical Statistics, 40, 1635\u20131660.","journal-title":"Annals of Mathematical Statistics"},{"key":"1128_CR29","unstructured":"Watkins, C. J. C. H. (1989). Learning from delayed rewards. Ph.D. thesis, Cambridge University, England."},{"key":"1128_CR30","unstructured":"Williams, R. J., & Baird, L. C. (1993). Analysis of some incremental variants of policy iteration: first steps toward understanding actor-critic learning systems (Report NU-CCS-93-11). College of Computer Science, Northeastern University."},{"key":"1128_CR31","volume-title":"Optimization over time","author":"P. Whittle","year":"1983","unstructured":"Whittle, P. (1983). Optimization over time (Vol. 2) New York: Wiley."},{"key":"1128_CR32","doi-asserted-by":"crossref","unstructured":"Yu, H., & Bertsekas, D. P. (2007a). A least squares Q-learning algorithm for optimal stopping problems (Technical Report 2731). LIDS, MIT.","DOI":"10.23919\/ECC.2007.7068523"},{"key":"1128_CR33","first-page":"2368","volume-title":"Proc. European control conference (ECC)","author":"H. Yu","year":"2007","unstructured":"Yu, H., & Bertsekas, D. P. (2007b). Q-learning algorithms for optimal stopping based on least squares. In Proc. European control conference (ECC), Kos, Greece (pp. 2368\u20132375)."},{"key":"1128_CR34","unstructured":"Yu, H., & Bertsekas, D. P. (2011). On boundedness of Q-learning iterates for stochastic shortest path problems (Technical Report 2859). LIDS, MIT. Accepted by Mathematics of Operations Research."},{"key":"1128_CR35","unstructured":"Yu, H. (2011). Some proof details for asynchronous stochastic approximation algorithms. On-line at: http:\/\/www.mit.edu\/~janey_yu\/note_asaproofs.pdf ."}],"container-title":["Annals of Operations Research"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10479-012-1128-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10479-012-1128-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10479-012-1128-z","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,27]],"date-time":"2019-06-27T07:34:32Z","timestamp":1561620872000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10479-012-1128-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,4,18]]},"references-count":35,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2013,9]]}},"alternative-id":["1128"],"URL":"https:\/\/doi.org\/10.1007\/s10479-012-1128-z","relation":{},"ISSN":["0254-5330","1572-9338"],"issn-type":[{"value":"0254-5330","type":"print"},{"value":"1572-9338","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012,4,18]]}}}