{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T02:55:26Z","timestamp":1779245726714,"version":"3.51.4"},"reference-count":74,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2016,8,5]],"date-time":"2016-08-05T00:00:00Z","timestamp":1470355200000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2016,12]]},"DOI":"10.1007\/s10994-016-5569-5","type":"journal-article","created":{"date-parts":[[2016,8,5]],"date-time":"2016-08-05T20:56:15Z","timestamp":1470430575000},"page":"367-417","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":35,"title":["Variance-constrained actor-critic algorithms for discounted and average reward MDPs"],"prefix":"10.1007","volume":"105","author":[{"given":"L. A.","family":"Prashanth","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohammad","family":"Ghavamzadeh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,8,5]]},"reference":[{"key":"5569_CR1","volume-title":"Constrained Markov decision processes","author":"E Altman","year":"1999","unstructured":"Altman, E. (1999). Constrained Markov decision processes (Vol. 7). Boca Raton: CRC Press."},{"key":"5569_CR2","first-page":"835","volume":"13","author":"A Barto","year":"1983","unstructured":"Barto, A., Sutton, R., & Anderson, C. (1983). Neuron-like elements that can solve difficult learning control problems. IEEE Transaction on Systems, Man and Cybernetics, 13, 835\u2013846.","journal-title":"IEEE Transaction on Systems, Man and Cybernetics"},{"issue":"4","key":"5569_CR3","doi-asserted-by":"crossref","first-page":"880","DOI":"10.1287\/moor.1080.0324","volume":"33","author":"A Basu","year":"2008","unstructured":"Basu, A., Bhattacharyya, T., & Borkar, V. (2008). A learning algorithm for risk-sensitive cost. Mathematics of Operations Research, 33(4), 880\u2013898.","journal-title":"Mathematics of Operations Research"},{"key":"5569_CR4","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1613\/jair.806","volume":"15","author":"J Baxter","year":"2001","unstructured":"Baxter, J., & Bartlett, P. (2001). Infinite-horizon policy-gradient estimation. Journal of Artificial Intelligence Research, 15, 319\u2013350.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"5569_CR5","volume-title":"Dynamic programming and optimal control","author":"D Bertsekas","year":"1995","unstructured":"Bertsekas, D. (1995). Dynamic programming and optimal control. Belmont, MA: Athena Scientific."},{"key":"5569_CR6","volume-title":"Nonlinear programming","author":"D Bertsekas","year":"1999","unstructured":"Bertsekas, D. (1999). Nonlinear programming. Belmont, MA: Athena Scientific."},{"key":"5569_CR7","volume-title":"Neuro-dynamic programming","author":"D Bertsekas","year":"1996","unstructured":"Bertsekas, D., & Tsitsiklis, J. (1996). Neuro-dynamic programming. Belmont, MA: Athena Scientific."},{"issue":"1","key":"5569_CR8","doi-asserted-by":"crossref","first-page":"74","DOI":"10.1145\/1044322.1044326","volume":"15","author":"S Bhatnagar","year":"2005","unstructured":"Bhatnagar, S. (2005). Adaptive multivariate three-timescale stochastic approximation algorithms for simulation based optimization. ACM Transactions on Modeling and Computer Simulation, 15(1), 74\u2013107.","journal-title":"ACM Transactions on Modeling and Computer Simulation"},{"issue":"1","key":"5569_CR9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1315575.1315577","volume":"18","author":"S Bhatnagar","year":"2007","unstructured":"Bhatnagar, S. (2007). Adaptive Newton-based multivariate smoothed functional algorithms for simulation optimization. ACM Transactions on Modeling and Computer Simulation, 18(1), 1\u201335.","journal-title":"ACM Transactions on Modeling and Computer Simulation"},{"issue":"12","key":"5569_CR10","doi-asserted-by":"crossref","first-page":"760","DOI":"10.1016\/j.sysconle.2010.08.013","volume":"59","author":"S Bhatnagar","year":"2010","unstructured":"Bhatnagar, S. (2010). An actor-critic algorithm with function approximation for discounted cost constrained Markov decision processes. Systems & Control Letters, 59(12), 760\u2013766.","journal-title":"Systems & Control Letters"},{"issue":"3","key":"5569_CR11","doi-asserted-by":"crossref","first-page":"688","DOI":"10.1007\/s10957-012-9989-5","volume":"153","author":"S Bhatnagar","year":"2012","unstructured":"Bhatnagar, S., & Lakshmanan, K. (2012). An online actor-critic algorithm with function approximation for constrained Markov decision processes. Journal of Optimization Theory and Applications, 153(3), 688\u2013708.","journal-title":"Journal of Optimization Theory and Applications"},{"issue":"2","key":"5569_CR12","doi-asserted-by":"crossref","first-page":"180","DOI":"10.1145\/858481.858486","volume":"13","author":"S Bhatnagar","year":"2003","unstructured":"Bhatnagar, S., Fu, M., Marcus, S., & Wang, I. (2003). Two-timescale simultaneous perturbation stochastic approximation using deterministic perturbation sequences. ACM Transactions on Modeling and Computer Simulation, 13(2), 180\u2013209.","journal-title":"ACM Transactions on Modeling and Computer Simulation"},{"key":"5569_CR13","unstructured":"Bhatnagar, S., Sutton, R., Ghavamzadeh, M., & Lee, M. (2007). Incremental natural actor-critic algorithms. In: Proceedings of advances in neural information processing systems (Vol. 20, pp. 105\u2013112)."},{"issue":"11","key":"5569_CR14","doi-asserted-by":"crossref","first-page":"2471","DOI":"10.1016\/j.automatica.2009.07.008","volume":"45","author":"S Bhatnagar","year":"2009","unstructured":"Bhatnagar, S., Sutton, R., Ghavamzadeh, M., & Lee, M. (2009a). Natural actor-critic algorithms. Automatica, 45(11), 2471\u20132482.","journal-title":"Automatica"},{"key":"5569_CR15","unstructured":"Bhatnagar, S., Sutton, R., Ghavamzadeh, M., & Lee, M. (2009b) Natural actor-critic algorithms. Technical report TR09-10, Department of Computing Science, University of Alberta."},{"issue":"3","key":"5569_CR16","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1145\/1921598.1921599","volume":"21","author":"S Bhatnagar","year":"2011","unstructured":"Bhatnagar, S., Hemachandra, N., & Mishra, V. (2011). Stochastic approximation algorithms for constrained optimization via simulation. ACM Transactions on Modeling and Computer Simulation, 21(3), 15.","journal-title":"ACM Transactions on Modeling and Computer Simulation"},{"key":"5569_CR17","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4471-4285-0","volume-title":"Stochastic recursive algorithms for optimization","author":"S Bhatnagar","year":"2013","unstructured":"Bhatnagar, S., Prasad, H., & Prashanth, L. (2013). Stochastic recursive algorithms for optimization (Vol. 434). Berlin: Springer."},{"key":"5569_CR18","doi-asserted-by":"crossref","first-page":"339","DOI":"10.1016\/S0167-6911(01)00152-9","volume":"44","author":"V Borkar","year":"2001","unstructured":"Borkar, V. (2001). A sensitivity formula for the risk-sensitive cost and the actor-critic algorithm. Systems & Control Letters, 44, 339\u2013346.","journal-title":"Systems & Control Letters"},{"key":"5569_CR19","doi-asserted-by":"crossref","first-page":"294","DOI":"10.1287\/moor.27.2.294.324","volume":"27","author":"V Borkar","year":"2002","unstructured":"Borkar, V. (2002). Q-learning for risk-sensitive control. Mathematics of Operations Research, 27, 294\u2013311.","journal-title":"Mathematics of Operations Research"},{"issue":"3","key":"5569_CR20","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1016\/j.sysconle.2004.08.007","volume":"54","author":"V Borkar","year":"2005","unstructured":"Borkar, V. (2005). An actor-critic algorithm for constrained Markov decision processes. Systems & Control Letters, 54(3), 207\u2013213.","journal-title":"Systems & Control Letters"},{"key":"5569_CR21","doi-asserted-by":"crossref","DOI":"10.1007\/978-93-86279-38-5","volume-title":"Stochastic approximation: A dynamical systems viewpoint","author":"V Borkar","year":"2008","unstructured":"Borkar, V. (2008). Stochastic approximation: A dynamical systems viewpoint. Cambridge: Cambridge University Press."},{"key":"5569_CR22","unstructured":"Borkar, V. (2010). Learning algorithms for risk-sensitive control. In Proceedings of the nineteenth international symposium on mathematical theory of networks and systems (pp. 1327\u20131332)."},{"issue":"2","key":"5569_CR23","doi-asserted-by":"crossref","first-page":"447","DOI":"10.1137\/S0363012997331639","volume":"38","author":"VS Borkar","year":"2000","unstructured":"Borkar, V. S., & Meyn, S. P. (2000). The ode method for convergence of stochastic approximation and reinforcement learning. SIAM Journal on Control and Optimization, 38(2), 447\u2013469.","journal-title":"SIAM Journal on Control and Optimization"},{"issue":"3","key":"5569_CR24","doi-asserted-by":"crossref","first-page":"442","DOI":"10.1109\/9.751340","volume":"44","author":"H Chen","year":"1999","unstructured":"Chen, H., Duncan, T., & Pasik-Duncan, B. (1999). A Kiefer\u2013Wolfowitz algorithm with randomized differences. IEEE Transactions on Automatic Control, 44(3), 442\u2013453.","journal-title":"IEEE Transactions on Automatic Control"},{"issue":"1","key":"5569_CR25","doi-asserted-by":"crossref","first-page":"203","DOI":"10.1287\/opre.1080.0685","volume":"58","author":"E Delage","year":"2010","unstructured":"Delage, E., & Mannor, S. (2010). Percentile optimization for Markov decision processes with parameter uncertainty. Operations Research, 58(1), 203\u2013213.","journal-title":"Operations Research"},{"issue":"5","key":"5569_CR26","doi-asserted-by":"crossref","first-page":"1811","DOI":"10.1137\/S0363012995283789","volume":"35","author":"J Dippon","year":"1997","unstructured":"Dippon, J., & Renz, J. (1997). Weighted means in stochastic approximation of minima. SIAM Journal on Control and Optimization, 35(5), 1811\u20131827.","journal-title":"SIAM Journal on Control and Optimization"},{"key":"5569_CR27","doi-asserted-by":"crossref","first-page":"1327","DOI":"10.1214\/aoms\/1177698258","volume":"39","author":"V Fabian","year":"1968","unstructured":"Fabian, V. (1968). On asymptotic normality in stochastic approximation. The Annals of Mathematical Statistics, 39, 1327\u20131332.","journal-title":"The Annals of Mathematical Statistics"},{"issue":"1","key":"5569_CR28","doi-asserted-by":"crossref","first-page":"147","DOI":"10.1287\/moor.14.1.147","volume":"14","author":"J Filar","year":"1989","unstructured":"Filar, J., Kallenberg, L., & Lee, H. (1989). Variance-penalized Markov decision processes. Mathematics of Operations Research, 14(1), 147\u2013161.","journal-title":"Mathematics of Operations Research"},{"issue":"1","key":"5569_CR29","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1109\/9.362904","volume":"40","author":"J Filar","year":"1995","unstructured":"Filar, J., Krass, D., & Ross, K. (1995). Percentile performance criteria for limiting average Markov decision processes. IEEE Transaction of Automatic Control, 40(1), 2\u201310.","journal-title":"IEEE Transaction of Automatic Control"},{"key":"5569_CR30","volume-title":"Practical optimization","author":"P Gill","year":"1981","unstructured":"Gill, P., Murray, W., & Wright, M. (1981). Practical optimization. London: Academic press."},{"issue":"7","key":"5569_CR31","doi-asserted-by":"crossref","first-page":"356","DOI":"10.1287\/mnsc.18.7.356","volume":"18","author":"R Howard","year":"1972","unstructured":"Howard, R., & Matheson, J. (1972). Risk sensitive Markov decision processes. Management Science, 18(7), 356\u2013369.","journal-title":"Management Science"},{"key":"5569_CR32","first-page":"81","volume":"8","author":"V Katkovnik","year":"1972","unstructured":"Katkovnik, V., & Kulchitsky, Y. (1972). Convergence of a class of random search algorithms. Automatic Remote Control, 8, 81\u201387.","journal-title":"Automatic Remote Control"},{"key":"5569_CR33","unstructured":"Konda, V., & Tsitsiklis, J. (2000). Actor-critic algorithms. In Proceedings of advances in neural information processing systems (Vol. 12, pp. 1008\u20131014)."},{"issue":"2","key":"5569_CR34","doi-asserted-by":"crossref","first-page":"796","DOI":"10.1214\/105051604000000116","volume":"14","author":"VR Konda","year":"2004","unstructured":"Konda, V. R., & Tsitsiklis, J. N. (2004). Convergence rate of linear two-time-scale stochastic approximation. Annals of Applied Probability, 14(2), 796\u2013819.","journal-title":"Annals of Applied Probability"},{"key":"5569_CR35","unstructured":"Korda, N., & Prashanth, L. (2015). On TD (0) with function approximation: Concentration bounds and a centered variant with exponential convergence. In International conference on machine learning (ICML)."},{"key":"5569_CR36","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4684-9352-8","volume-title":"Stochastic approximation methods for constrained and unconstrained systems","author":"H Kushner","year":"1978","unstructured":"Kushner, H., & Clark, D. (1978). Stochastic approximation methods for constrained and unconstrained systems. Berlin: Springer."},{"key":"5569_CR37","unstructured":"Mannor, S., & Tsitsiklis, J. (2011). Mean\u2013variance optimization in Markov decision processes. In Proceedings of the twenty-eighth international conference on machine learning (pp. 177\u2013184)."},{"issue":"3","key":"5569_CR38","doi-asserted-by":"crossref","first-page":"645","DOI":"10.1016\/j.ejor.2013.06.019","volume":"231","author":"S Mannor","year":"2013","unstructured":"Mannor, S., & Tsitsiklis, J. N. (2013). Algorithmic aspects of mean\u2013variance optimization in Markov decision processes. European Journal of Operational Research, 231(3), 645\u2013653.","journal-title":"European Journal of Operational Research"},{"key":"5569_CR39","unstructured":"Marbach, P. (1998). Simulated-based methods for Markov decision processes. Ph.D. thesis, Massachusetts Institute of Technology."},{"key":"5569_CR40","volume-title":"Microeconomic theory","author":"A Mas-Colell","year":"1995","unstructured":"Mas-Colell, A., Whinston, M., & Green, J. (1995). Microeconomic theory. Oxford: Oxford University Press."},{"issue":"2","key":"5569_CR41","doi-asserted-by":"crossref","first-page":"267","DOI":"10.1023\/A:1017940631555","volume":"49","author":"O Mihatsch","year":"2002","unstructured":"Mihatsch, O., & Neuneier, R. (2002). Risk-sensitive reinforcement learning. Machine Learning, 49(2), 267\u2013290.","journal-title":"Machine Learning"},{"issue":"2","key":"5569_CR42","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1111\/1468-0262.00296","volume":"70","author":"P Milgrom","year":"2002","unstructured":"Milgrom, P., & Segal, I. (2002). Envelope theorems for arbitrary choice sets. Econometrica, 70(2), 583\u2013601.","journal-title":"Econometrica"},{"issue":"5","key":"5569_CR43","doi-asserted-by":"crossref","first-page":"780","DOI":"10.1287\/opre.1050.0216","volume":"53","author":"A Nilim","year":"2005","unstructured":"Nilim, A., & Ghaoui, L. E. (2005). Robust control of Markov decision processes with uncertain transition matrices. Operations Research, 53(5), 780\u2013798.","journal-title":"Operations Research"},{"key":"5569_CR44","doi-asserted-by":"crossref","unstructured":"Peters, J., Vijayakumar, S., & Schaal, S. (2005). Natural actor-critic. In Proceedings of the sixteenth european conference on machine learning (pp. 280\u2013291).","DOI":"10.1007\/11564096_29"},{"issue":"4","key":"5569_CR45","doi-asserted-by":"crossref","first-page":"838","DOI":"10.1137\/0330046","volume":"30","author":"BT Polyak","year":"1992","unstructured":"Polyak, B. T., & Juditsky, A. B. (1992). Acceleration of stochastic approximation by averaging. SIAM Journal on Control and Optimization, 30(4), 838\u2013855.","journal-title":"SIAM Journal on Control and Optimization"},{"issue":"2","key":"5569_CR46","doi-asserted-by":"crossref","first-page":"412","DOI":"10.1109\/TITS.2010.2091408","volume":"12","author":"L Prashanth","year":"2011","unstructured":"Prashanth, L., & Bhatnagar, S. (2011). Reinforcement learning with function approximation for traffic signal control. IEEE Transactions on Intelligent Transportation Systems, 12(2), 412\u2013421.","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"issue":"9","key":"5569_CR47","doi-asserted-by":"crossref","first-page":"3865","DOI":"10.1109\/TVT.2012.2209904","volume":"61","author":"L Prashanth","year":"2012","unstructured":"Prashanth, L., & Bhatnagar, S. (2012). Threshold tuning using stochastic optimization for graded signal control. IEEE Transactions on Vehicular Technology, 61(9), 3865\u20133880.","journal-title":"IEEE Transactions on Vehicular Technology"},{"key":"5569_CR48","unstructured":"Prashanth, L., & Ghavamzadeh, M. (2013). Actor-critic algorithms for risk-sensitive MDPs. In Proceedings of advances in neural information processing systems (Vol. 26, pp. 252\u2013260)."},{"key":"5569_CR49","unstructured":"Prashanth, L., Jie, C., Fu, M., Marcus, S. & Szepesvari, C. (2016). Cumulative prospect theory meets reinforcement learning: Prediction and control. In Proceedings of the 33rd international conference on machine learning (pp. 1406\u20131415)."},{"key":"5569_CR50","doi-asserted-by":"crossref","DOI":"10.1002\/9780470316887","volume-title":"Markov decision processes: Discrete stochastic dynamic programming","author":"M Puterman","year":"1994","unstructured":"Puterman, M. (1994). Markov decision processes: Discrete stochastic dynamic programming. London: Wiley."},{"key":"5569_CR51","unstructured":"Ruppert, D. (1991). Stochastic approximation. In B. K. Ghosh & P. K. Sen (Eds.), Handbook of Sequential Analysis (pp. 503\u2013529). New York: Marcel Dekker."},{"key":"5569_CR52","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1007\/s10107-010-0393-3","volume":"125","author":"A Ruszczy\u0144ski","year":"2010","unstructured":"Ruszczy\u0144ski, A. (2010). Risk-averse dynamic programming for Markov decision processes. Mathematical Programming, 125, 235\u2013261.","journal-title":"Mathematical Programming"},{"key":"5569_CR53","doi-asserted-by":"crossref","first-page":"401","DOI":"10.1017\/S0021900200110083","volume":"5","author":"PJ Schweitzer","year":"1968","unstructured":"Schweitzer, P. J. (1968). Perturbation theory and finite Markov chains. Journal of Applied Probability, 5, 401\u2013413.","journal-title":"Journal of Applied Probability"},{"issue":"1","key":"5569_CR54","doi-asserted-by":"crossref","first-page":"119","DOI":"10.1086\/294846","volume":"39","author":"W Sharpe","year":"1966","unstructured":"Sharpe, W. (1966). Mutual fund performance. Journal of Business, 39(1), 119\u2013138.","journal-title":"Journal of Business"},{"issue":"5","key":"5569_CR55","doi-asserted-by":"crossref","first-page":"3652","DOI":"10.1137\/120899005","volume":"51","author":"Y Shen","year":"2013","unstructured":"Shen, Y., Stannat, W., & Obermayer, K. (2013). Risk-sensitive Markov control processes. SIAM Journal on Control and Optimization, 51(5), 3652\u20133672.","journal-title":"SIAM Journal on Control and Optimization"},{"issue":"1","key":"5569_CR56","doi-asserted-by":"crossref","first-page":"171","DOI":"10.2140\/pjm.1958.8.171","volume":"8","author":"M Sion","year":"1958","unstructured":"Sion, M. (1958). On general minimax theorems. Pacific Journal of Mathematics, 8(1), 171\u2013176.","journal-title":"Pacific Journal of Mathematics"},{"key":"5569_CR57","doi-asserted-by":"crossref","first-page":"794","DOI":"10.1017\/S0021900200023123","volume":"19","author":"M Sobel","year":"1982","unstructured":"Sobel, M. (1982). The variance of discounted Markov decision processes. Applied Probability, 19, 794\u2013802.","journal-title":"Applied Probability"},{"issue":"3","key":"5569_CR58","doi-asserted-by":"crossref","first-page":"332","DOI":"10.1109\/9.119632","volume":"37","author":"J Spall","year":"1992","unstructured":"Spall, J. (1992). Multivariate stochastic approximation using a simultaneous perturbation gradient approximation. IEEE Transactions on Automatic Control, 37(3), 332\u2013341.","journal-title":"IEEE Transactions on Automatic Control"},{"issue":"1","key":"5569_CR59","doi-asserted-by":"crossref","first-page":"109","DOI":"10.1016\/S0005-1098(96)00149-5","volume":"33","author":"J Spall","year":"1997","unstructured":"Spall, J. (1997). A one-measurement form of simultaneous perturbation stochastic approximation. Automatica, 33(1), 109\u2013112.","journal-title":"Automatica"},{"issue":"10","key":"5569_CR60","doi-asserted-by":"crossref","first-page":"1839","DOI":"10.1109\/TAC.2000.880982","volume":"45","author":"J Spall","year":"2000","unstructured":"Spall, J. (2000). Adaptive stochastic approximation by the simultaneous perturbation method. IEEE Transactions on Automatic Control, 45(10), 1839\u20131853.","journal-title":"IEEE Transactions on Automatic Control"},{"issue":"5","key":"5569_CR61","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1109\/TCAD.1986.1270179","volume":"1","author":"MA Styblinski","year":"1986","unstructured":"Styblinski, M. A., & Opalski, L. J. (1986). Algorithms and software tools for IC yield optimization based on fundamental fabrication parameters. IEEE Transactions on Computer Aided Design CAD, 1(5), 79\u201389.","journal-title":"IEEE Transactions on Computer Aided Design CAD"},{"key":"5569_CR62","unstructured":"Sutton, R. (1984). Temporal credit assignment in reinforcement learning. Ph.D. thesis, University of Massachusetts Amherst."},{"key":"5569_CR63","first-page":"9","volume":"3","author":"R Sutton","year":"1988","unstructured":"Sutton, R. (1988). Learning to predict by the methods of temporal differences. Machine Learning, 3, 9\u201344.","journal-title":"Machine Learning"},{"key":"5569_CR64","volume-title":"Reinforcement learning: An introduction","author":"R Sutton","year":"1998","unstructured":"Sutton, R., & Barto, A. (1998). Reinforcement learning: An introduction. Cambridge: MIT Press."},{"key":"5569_CR65","unstructured":"Sutton, R., McAllester, D., Singh, S., & Mansour, Y. (2000). Policy gradient methods for reinforcement learning with function approximation. In Proceedings of advances in neural information processing systems (Vol. 12, pp. 1057\u20131063)."},{"key":"5569_CR66","unstructured":"Sutton, R. S., McAllester, D. A., Singh, S. P., Mansour, Y., et\u00a0al. (1999). Policy gradient methods for reinforcement learning with function approximation. In NIPS, Citeseer (Vol.\u00a099, pp. 1057\u20131063)."},{"key":"5569_CR67","unstructured":"Tamar, A., & Mannor, S. (2013). Variance adjusted actor critic algorithms. arXiv:1310.3697 ."},{"key":"5569_CR68","unstructured":"Tamar, A., Di\u00a0Castro, D., & Mannor, S. (2012). Policy gradients with variance related risk criteria. In Proceedings of the twenty-ninth international conference on machine learning (pp. 387\u2013396)."},{"key":"5569_CR69","unstructured":"Tamar, A., Di\u00a0Castro, D., & Mannor, S. (2013a). Policy evaluation with variance related risk criteria in markov decision processes. arXiv:1301.0104 ."},{"key":"5569_CR70","unstructured":"Tamar, A., Di\u00a0Castro, D., & Mannor, S. (2013b). Temporal difference methods for the variance of the reward to go. In Proceedings of the thirtieth international conference on machine learning (pp. 495\u2013503)."},{"issue":"5","key":"5569_CR71","doi-asserted-by":"crossref","first-page":"674","DOI":"10.1109\/9.580874","volume":"42","author":"JN Tsitsiklis","year":"1997","unstructured":"Tsitsiklis, J. N., & Van Roy, B. (1997). An analysis of temporal-difference learning with function approximation. IEEE Transactions on Automatic Control, 42(5), 674\u2013690.","journal-title":"IEEE Transactions on Automatic Control"},{"key":"5569_CR72","doi-asserted-by":"crossref","unstructured":"Wiering, M., Vreeken, J., van Veenen, J., & Koopman, A. (2004). Simulation and optimization of traffic in a city. In IEEE intelligent vehicles symposium (pp. 453\u2013458).","DOI":"10.1109\/IVS.2004.1336426"},{"key":"5569_CR73","first-page":"229","volume":"8","author":"R Williams","year":"1992","unstructured":"Williams, R. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8, 229\u2013256.","journal-title":"Machine Learning"},{"issue":"2","key":"5569_CR74","doi-asserted-by":"crossref","first-page":"288","DOI":"10.1287\/moor.1120.0540","volume":"37","author":"H Xu","year":"2012","unstructured":"Xu, H., & Mannor, S. (2012). Distributionally robust Markov decision processes. Mathematics of Operations Research, 37(2), 288\u2013300.","journal-title":"Mathematics of Operations Research"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-016-5569-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-016-5569-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-016-5569-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,12]],"date-time":"2019-09-12T07:26:57Z","timestamp":1568273217000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-016-5569-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,8,5]]},"references-count":74,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2016,12]]}},"alternative-id":["5569"],"URL":"https:\/\/doi.org\/10.1007\/s10994-016-5569-5","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016,8,5]]}}}