{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:47:51Z","timestamp":1774018071290,"version":"3.50.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T00:00:00Z","timestamp":1766966400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T00:00:00Z","timestamp":1766966400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Dyn Games Appl"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s13235-025-00693-9","type":"journal-article","created":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T05:10:04Z","timestamp":1766985004000},"page":"386-425","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Model-Agnostic Hessian-Free Meta-policy Optimization via Zeroth-Order Estimation: A Linear Quadratic Regulator Perspective"],"prefix":"10.1007","volume":"16","author":[{"given":"Yunian","family":"Pan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quanyan","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,12,29]]},"reference":[{"key":"693_CR1","doi-asserted-by":"publisher","unstructured":"Hochreiter SY (2001) Learning to learn using gradient descent. Lecture Notes Comput Sci, 87\u201394 https:\/\/doi.org\/10.1007\/3-540-44668-0_13","DOI":"10.1007\/3-540-44668-0_13"},{"key":"693_CR2","doi-asserted-by":"publisher","unstructured":"Zhao Y, Zhu Q (2023) Stackelberg meta-learning for strategic guidance in multi-robot trajectory planning. In: 2023 IEEE\/RSJ international conference on intelligent robots and systems (IROS), pp 11342\u201311347. https:\/\/doi.org\/10.1109\/IROS55552.2023.10342202","DOI":"10.1109\/IROS55552.2023.10342202"},{"key":"693_CR3","doi-asserted-by":"publisher","unstructured":"Li T, Lei H, Zhu Q (2023) Self-adaptive driving in nonstationary environments through conjectural online lookahead adaptation. In: 2023 IEEE international conference on robotics and automation (ICRA), pp 7205\u20137211. https:\/\/doi.org\/10.1109\/ICRA48891.2023.10161368","DOI":"10.1109\/ICRA48891.2023.10161368"},{"key":"693_CR4","doi-asserted-by":"publisher","unstructured":"Pan Y, Li T, Li H, Xu T, Zheng Z, Zhu Q (2023) A first order meta Stackelberg method for robust federated learning. In: Adversarial Machine learning frontiers workshop at 40th international conference on machine learning. https:\/\/doi.org\/10.48550\/arxiv.2306.13800","DOI":"10.48550\/arxiv.2306.13800"},{"key":"693_CR5","doi-asserted-by":"publisher","unstructured":"Li T, Li H, Pan Y, Xu T, Zheng Z, Zhu Q (2024) Meta stackelberg game: Robust federated learning against adaptive and mixed poisoning attacks. arXiv preprint https:\/\/doi.org\/10.48550\/arXiv.2410.17431","DOI":"10.48550\/arXiv.2410.17431"},{"key":"693_CR6","doi-asserted-by":"publisher","unstructured":"Ge Y, Li T, Zhu Q (2023) Scenario-agnostic zero-trust defense with explainable threshold policy: a meta-learning approach. In: IEEE INFOCOM 2023 - IEEE conference on computer communications workshops (INFOCOM WKSHPS), pp 1\u20136. https:\/\/doi.org\/10.1109\/INFOCOMWKSHPS57453.2023.10225816","DOI":"10.1109\/INFOCOMWKSHPS57453.2023.10225816"},{"key":"693_CR7","doi-asserted-by":"publisher","unstructured":"Li T, Lei H, Zhu Q (2022) Sampling attacks on meta reinforcement learning: a minimax formulation and complexity analysis. arXiv preprint https:\/\/doi.org\/10.48550\/arXiv.2208.00081","DOI":"10.48550\/arXiv.2208.00081"},{"key":"693_CR8","unstructured":"Finn C, Abbeel P, Levine S (2017) Model-agnostic meta-learning for fast adaptation of deep networks. In: International conference on machine learning, pp 1126\u20131135 . PMLR"},{"key":"693_CR9","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3079209","author":"TM Hospedales","year":"2021","unstructured":"Hospedales TM, Antoniou A, Micaelli P, Storkey AJ (2021) Meta-learning in neural networks: a survey. IEEE Trans Pattern Anal Mach Intell. https:\/\/doi.org\/10.1109\/tpami.2021.3079209","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"1\u20132","key":"693_CR10","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1007\/s10107-016-1017-3","volume":"161","author":"M Wang","year":"2017","unstructured":"Wang M, Fang EX, Liu H (2017) Stochastic compositional gradient descent: algorithms for minimizing compositions of expected-value functions. Math Program 161(1\u20132):419\u2013449. https:\/\/doi.org\/10.1007\/s10107-016-1017-3","journal-title":"Math Program"},{"key":"693_CR11","doi-asserted-by":"publisher","first-page":"4937","DOI":"10.1109\/tsp.2021.3092377","volume":"69","author":"T Chen","year":"2021","unstructured":"Chen T, Sun Y, Yin W (2021) Solving stochastic compositional optimization is nearly as easy as solving stochastic optimization. IEEE Trans Signal Process 69:4937\u20134948. https:\/\/doi.org\/10.1109\/tsp.2021.3092377","journal-title":"IEEE Trans Signal Process"},{"issue":"13s","key":"693_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3582688","volume":"55","author":"Y Song","year":"2023","unstructured":"Song Y, Wang T, Cai P, Mondal SK, Sahoo JP (2023) A comprehensive survey of few-shot learning: evolution, applications, challenges, and opportunities. ACM Comput Surv 55(13s):1\u201340","journal-title":"ACM Comput Surv"},{"key":"693_CR13","unstructured":"Fallah A, Georgiev K, Mokhtari A, Ozdaglar A (2021) On the convergence theory of debiased model-agnostic meta-reinforcement learning. In: Proceedings of the 35th international conference on neural information processing systems. NIPS \u201921. Curran Associates Inc., Red Hook, NY, USA"},{"key":"693_CR14","unstructured":"Nichol A, Achiam J, Schulman J (2018) On first-order meta-learning algorithms. CoRR abs\/1803.02999"},{"key":"693_CR15","doi-asserted-by":"crossref","unstructured":"Stein C (1972) A bound for the error in the normal approximation to the distribution of a sum of dependent random variables. In: Proceedings of the sixth Berkeley symposium on mathematical statistics and probability, Volume 2: Probability Theory, vol. 6, University of California Press, pp 583\u2013603","DOI":"10.1525\/9780520423671-036"},{"key":"693_CR16","doi-asserted-by":"crossref","unstructured":"Basar T, Olsder G (1999) Dynamic noncooperative game theory. (classics in applied mathematics 23)","DOI":"10.1137\/1.9781611971132"},{"key":"693_CR17","unstructured":"Abbasi-Yadkori Y, P\u00e1l D, Szepesv\u00e1ri C (2011) Online least squares estimation with self-normalized processes: an application to bandit problems. CoRR arXiv: abs\/1102.2670 (2011)"},{"key":"693_CR18","unstructured":"Abbasi-Yadkori Y, Szepesv\u00e1ri C (2011) Regret bounds for the adaptive control of linear quadratic systems. In: Proceedings of the 24th annual conference on learning theory, pp 1\u201326"},{"key":"693_CR19","unstructured":"Cohen A, Koren T, Mansour Y (2019) Learning linear-quadratic regulators efficiently with only $$\\sqrt{T}$$ regret. In: Chaudhuri K, Salakhutdinov R (eds.) Proceedings of the 36th international conference on machine learning. Proceedings of Machine Learning Research, vol. 97, pp 1300\u20131309. PMLR, Long Beach, California . https:\/\/proceedings.mlr.press\/v97\/cohen19b.html"},{"issue":"1","key":"693_CR20","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/s0005-1098(96)00149-5","volume":"33","author":"JC Spall","year":"1997","unstructured":"Spall JC (1997) A one-measurement form of simultaneous perturbation stochastic approximation. Automatica 33(1):109\u2013112. https:\/\/doi.org\/10.1016\/s0005-1098(96)00149-5","journal-title":"Automatica"},{"key":"693_CR21","unstructured":"Flaxman AD, Kalai AT, McMahan HB Online Convex optimization in the bandit setting: gradient descent without a gradient. In: Proceedings of the sixteenth annual ACM-SIAM symposium on discrete algorithms. SODA \u201905. Society for Industrial and Applied Mathematics, USA, pp 385\u2013394"},{"issue":"9","key":"693_CR22","doi-asserted-by":"publisher","first-page":"1616","DOI":"10.2514\/3.5955","volume":"8","author":"SB Gershwin","year":"1970","unstructured":"Gershwin SB, Jacobson DH (1970) A discrete-time differential dynamic programming algorithm with application to optimal orbit transfer. AIAA J 8(9):1616\u20131626","journal-title":"AIAA J"},{"issue":"3","key":"693_CR23","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1023\/A:1022672621406","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams RJ (1992) Simple statistical gradient-following algorithms for connectionist reinforcement learning. Mach Learn 8(3):229\u2013256","journal-title":"Mach Learn"},{"key":"693_CR24","unstructured":"Fazel M, Ge R, Kakade S, Mesbahi M (2018) Global convergence of policy gradient methods for the linear quadratic regulator. In: International conference on machine learning. PMLR, pp 1467\u20131476"},{"issue":"5","key":"693_CR25","doi-asserted-by":"publisher","first-page":"3359","DOI":"10.1137\/20M1382386","volume":"59","author":"B Hambly","year":"2021","unstructured":"Hambly B, Xu R, Yang H (2021) Policy gradient methods for the noisy linear quadratic regulator over a finite horizon. SIAM J Control Optim 59(5):3359\u20133391. https:\/\/doi.org\/10.1137\/20M1382386","journal-title":"SIAM J Control Optim"},{"key":"693_CR26","unstructured":"Malik D, Pananjady A, Bhatia K, Khamaru K, Bartlett P, Wainwright M (2019) Derivative-free methods for policy optimization: guarantees for linear quadratic systems. In: The 22nd international conference on artificial intelligence and statistics. PMLR, pp 2916\u20132925"},{"issue":"11","key":"693_CR27","doi-asserted-by":"publisher","first-page":"5283","DOI":"10.1109\/TAC.2020.3037046","volume":"66","author":"B Gravell","year":"2020","unstructured":"Gravell B, Esfahani PM, Summers T (2020) Learning optimal controllers for linear systems with multiplicative noise via policy gradient. IEEE Trans Autom Control 66(11):5283\u20135298","journal-title":"IEEE Trans Autom Control"},{"key":"693_CR28","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1146\/annurev-control-042920-020021","volume":"6","author":"B Hu","year":"2023","unstructured":"Hu B, Zhang K, Li N, Mesbahi M, Fazel M, Ba\u015far T (2023) Toward a theoretical foundation of policy optimization for learning control policies. Ann Rev Control Robot Autonom Syst 6:123\u2013158","journal-title":"Ann Rev Control Robot Autonom Syst"},{"key":"693_CR29","unstructured":"Salimans T, Ho J, Chen X, Sidor S, Sutskever I (2017) Evolution strategies as a scalable alternative to reinforcement learning. https:\/\/arxiv.org\/abs\/1703.03864"},{"key":"693_CR30","unstructured":"Allen M, Raisbeck J, Lee H (2023) A scalable finite difference method for deep reinforcement learning. https:\/\/arxiv.org\/abs\/2210.07487"},{"key":"693_CR31","unstructured":"Finn C, Rajeswaran A, Kakade S, Levine S (2019) Online meta-learning. In: International conference on machine learning. PMLR, pp 1920\u20131930"},{"key":"693_CR32","unstructured":"Fallah A, Georgiev K, Mokhtari A, Ozdaglar A (2020) Provably convergent policy gradient methods for model-agnostic meta-reinforcement learning. arXiv preprint arXiv:2002.05135"},{"key":"693_CR33","first-page":"31059","volume":"35","author":"B Liu","year":"2022","unstructured":"Liu B, Feng X, Ren J, Mai L, Zhu R, Zhang H, Wang J, Yang Y (2022) A theoretical understanding of gradient bias in meta-reinforcement learning. Adv Neural Inf Process Syst 35:31059\u201331072","journal-title":"Adv Neural Inf Process Syst"},{"key":"693_CR34","unstructured":"Beck J, Vuorio R, Liu EZ, Xiong Z, Zintgraf L, Finn C, Whiteson S (2023) A survey of meta-reinforcement learning. arXiv preprint arXiv:2301.08028"},{"key":"693_CR35","unstructured":"Molybog I, Lavaei J (2020) Global convergence of maml for lqr. arXiv preprint arXiv:2006.00453"},{"key":"693_CR36","doi-asserted-by":"publisher","unstructured":"Musavi N, Dullerud GE (2023) Convergence of gradient-based MAML in LQR. In: Proceedings of the 62nd IEEE conference on decision and control (CDC). IEEE, Singapore, pp 7362\u20137366. https:\/\/doi.org\/10.1109\/CDC49753.2023.10383370","DOI":"10.1109\/CDC49753.2023.10383370"},{"key":"693_CR37","unstructured":"Toso LF, Zhan D, Anderson J, Wang H (2024) Meta-learning linear quadratic regulators: A policy gradient MAML approach for model-free LQR. In: Abate A, Cannon M, Margellos K, Papachristodoulou A (eds.) Proceedings of the 6th annual learning for dynamics; control conference. Proceedings of Machine Learning Research. PMLR, Oxford, UK, vol. 242, pp 902\u2013915. https:\/\/proceedings.mlr.press\/v242\/toso24a.html"},{"issue":"1","key":"693_CR38","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/s10208-021-09499-8","volume":"22","author":"K Balasubramanian","year":"2022","unstructured":"Balasubramanian K, Ghadimi S (2022) Zeroth-order nonconvex stochastic optimization: handling constraints, high dimensionality, and saddle points. Found Comput Math 22(1):35\u201376. https:\/\/doi.org\/10.1007\/s10208-021-09499-8","journal-title":"Found Comput Math"},{"key":"693_CR39","unstructured":"Pan Y, Zhu Q (2024) Model-agnostic zeroth-order policy optimization for meta-learning of ergodic linear quadratic regulators. https:\/\/arxiv.org\/abs\/2405.17370"},{"key":"693_CR40","unstructured":"Bu J, Mesbahi A, Fazel M, Mesbahi M (2019) LQR through the lens of first order methods: discrete-time case. https:\/\/arxiv.org\/abs\/1907.08921"},{"key":"693_CR41","unstructured":"Yang Z, Chen Y, Hong M, Wang Z (2019) On the global convergence of actor-critic: a case for linear quadratic regulator with ergodic cost. CoRR arXiv: abs\/1907.06246"},{"key":"693_CR42","unstructured":"Yang Z, Chen Y, Hong M, Wang Z (2019) Provably global convergence of actor-critic: a case for linear quadratic regulator with ergodic cost. In: Advances in neural information processing systems, vol. 32 . https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/9713faa264b94e2bf346a1bb52587fd8-Paper.pdf"},{"issue":"98","key":"693_CR43","first-page":"1","volume":"22","author":"A Agarwal","year":"2021","unstructured":"Agarwal A, Kakade SM, Lee JD, Mahajan G (2021) On the theory of policy gradient methods: optimality, approximation, and distribution shift. J Mach Learn Res 22(98):1\u201376","journal-title":"J Mach Learn Res"},{"key":"693_CR44","unstructured":"Ahmed Z, Le\u00a0Roux N, Norouzi M, Schuurmans D (2019) Understanding the impact of entropy on policy optimization. In: Chaudhuri K, Salakhutdinov R (eds.) Proceedings of the 36th international conference on machine learning. Proceedings of Machine Learning Research. PMLR, Long Beach, California, vol. 97, pp 151\u2013160. https:\/\/proceedings.mlr.press\/v97\/ahmed19a.html"},{"key":"693_CR45","unstructured":"Mei J, Xiao C, Szepesvari C, Schuurmans D (2020) On the global convergence rates of softmax policy gradient methods. In: III HD, Singh A (eds.) Proceedings of the 37th international conference on machine learning. Proceedings of Machine Learning Research. PMLR, Virtual, vol. 119, pp 6820\u20136829. https:\/\/proceedings.mlr.press\/v119\/mei20b.html"},{"key":"693_CR46","doi-asserted-by":"publisher","unstructured":"Pan Y, Li T, Zhu Q (2023) On the resilience of traffic networks under non-equilibrium learning. In: 2023 American control conference (ACC), pp 3484\u20133489. https:\/\/doi.org\/10.23919\/ACC55779.2023.10156139","DOI":"10.23919\/ACC55779.2023.10156139"},{"key":"693_CR47","doi-asserted-by":"publisher","unstructured":"Pan Y, Li T, Zhu Q (2023) Is stochastic mirror descent vulnerable to adversarial delay attacks? A traffic assignment resilience study. In: 2023 62nd IEEE conference on decision and control (CDC), pp 8328\u20138333. https:\/\/doi.org\/10.1109\/CDC49753.2023.10384003","DOI":"10.1109\/CDC49753.2023.10384003"},{"key":"693_CR48","doi-asserted-by":"publisher","unstructured":"Pan Y, Li T, Zhu Q (2024) On the variational interpretation of mirror play in monotone games. In: 2024 IEEE 63rd conference on decision and control (CDC), pp 6799\u20136804. https:\/\/doi.org\/10.1109\/CDC56724.2024.10885800","DOI":"10.1109\/CDC56724.2024.10885800"},{"issue":"4","key":"693_CR49","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1109\/mcs.2022.3171478","volume":"42","author":"T Li","year":"2022","unstructured":"Li T, Peng G, Zhu Q, Baar T (2022) The confluence of networks, games, and learning a game-theoretic framework for multiagent decision making over networks. IEEE Control Syst 42(4):35\u201367. https:\/\/doi.org\/10.1109\/mcs.2022.3171478","journal-title":"IEEE Control Syst"},{"key":"693_CR50","first-page":"29274","volume":"34","author":"J Perdomo","year":"2021","unstructured":"Perdomo J, Umenberger J, Simchowitz M (2021) Stabilizing dynamical systems via policy gradient methods. Adv Neural Inf Process Syst 34:29274\u201329286","journal-title":"Adv Neural Inf Process Syst"},{"key":"693_CR51","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1109\/LCSYS.2022.3188180","volume":"7","author":"IK Ozaslan","year":"2022","unstructured":"Ozaslan IK, Mohammadi H, Jovanovi\u0107 MR (2022) Computing stabilizing feedback gains via a model-free policy gradient method. IEEE Control Syst Lett 7:407\u2013412","journal-title":"IEEE Control Syst Lett"},{"key":"693_CR52","doi-asserted-by":"publisher","unstructured":"Faradonbeh MKS, Modi A (2022) Joint learning-based stabilization of multiple unknown linear systems. IFAC-PapersOnLine 55(12), 723\u2013728 https:\/\/doi.org\/10.1016\/j.ifacol.2022.07.398. 14th IFAC workshop on adaptive and learning control systems ALCOS 2022","DOI":"10.1016\/j.ifacol.2022.07.398"},{"key":"693_CR53","unstructured":"Wang H, Toso LF, Anderson J (2023) Fedsysid: A federated approach to sample-efficient system identification. In: Learning for dynamics and control conference. PMLR, pp 1308\u20131320"}],"container-title":["Dynamic Games and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13235-025-00693-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13235-025-00693-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13235-025-00693-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T13:02:37Z","timestamp":1774011757000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13235-025-00693-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,29]]},"references-count":53,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["693"],"URL":"https:\/\/doi.org\/10.1007\/s13235-025-00693-9","relation":{},"ISSN":["2153-0785","2153-0793"],"issn-type":[{"value":"2153-0785","type":"print"},{"value":"2153-0793","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,29]]},"assertion":[{"value":"1 March 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}