{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T18:37:18Z","timestamp":1780511838750,"version":"3.54.1"},"reference-count":74,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["E5EQ0101X2"],"award-info":[{"award-number":["E5EQ0101X2"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"NSF","award":["CCF-2311275"],"award-info":[{"award-number":["CCF-2311275"]}]},{"name":"NSF","award":["ECCS-2326591"],"award-info":[{"award-number":["ECCS-2326591"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Signal Process."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tsp.2025.3624427","type":"journal-article","created":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T18:05:29Z","timestamp":1761847529000},"page":"4734-4747","source":"Crossref","is-referenced-by-count":2,"title":["Fully First-Order Methods for DecentralizedBilevel Optimization"],"prefix":"10.1109","volume":"73","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4102-4909","authenticated-orcid":false,"given":"Xiaoyu","family":"Wang","sequence":"first","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, PR China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3183-2008","authenticated-orcid":false,"given":"Xuxing","family":"Chen","sequence":"additional","affiliation":[{"name":"University of California Davis, Davis, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1967-1069","authenticated-orcid":false,"given":"Shiqian","family":"Ma","sequence":"additional","affiliation":[{"name":"Rice University, Houston, TX, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5511-2558","authenticated-orcid":false,"given":"Tong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Champaign, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","first-page":"318","article-title":"Generic methods for optimization-based modeling","volume-title":"Proc. 15th Int. Conf. Artif. Intell. Statist.","volume":"22","author":"Domke","year":"2012"},{"key":"ref2","first-page":"2113","article-title":"Gradient-based hyperparameter optimization through reversible learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Maclaurin","year":"2015"},{"key":"ref3","first-page":"1165","article-title":"Forward and reverse gradient-based hyperparameter optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"70","author":"Franceschi","year":"2017"},{"key":"ref4","first-page":"1540","article-title":"Optimizing millions of hyperparameters by implicit differentiation","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Lorraine","year":"2020"},{"key":"ref5","article-title":"Learning to learn by gradient descent by gradient descent","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Andrychowicz","year":"2016"},{"key":"ref6","first-page":"1568","article-title":"Bilevel programming for hyperparameter optimization and meta-learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Franceschi","year":"2018"},{"key":"ref7","article-title":"Meta-learning with implicit gradients","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Rajeswaran","year":"2019"},{"key":"ref8","article-title":"Provably global convergence of actor-critic: A case for linear quadratic regulator with ergodic cost","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Yang","year":"2019"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1137\/20M1387341"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TEVC.2017.2712906"},{"key":"ref11","article-title":"Approximation methods for bilevel programming","author":"Ghadimi","year":"2018"},{"key":"ref12","first-page":"318","article-title":"Generic methods for optimization-based modelling","volume-title":"Proc. Artif. Intell. Statist.","author":"Domke","year":"2012"},{"key":"ref13","first-page":"737","article-title":"Hyperparameter optimization with approximate gradient","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Pedregosa","year":"2016"},{"key":"ref14","article-title":"On differentiating parameterized argmin and argmax problems with application to bi-level optimization","author":"Gould","year":"2016"},{"key":"ref15","first-page":"3748","article-title":"On the iteration complexity of hypergradient computation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Grazzi","year":"2020"},{"key":"ref16","first-page":"4882","article-title":"Bilevel optimization: Convergence analysis and enhanced design","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ji","year":"2021"},{"key":"ref17","article-title":"Closing the gap: Tighter analysis of alternating stochastic gradient methods for bilevel problems","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Chen","year":"2021"},{"key":"ref18","article-title":"Amortized implicit differentiation for stochastic bilevel optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Arbel","year":"2022"},{"key":"ref19","first-page":"26698","article-title":"A framework for bilevel optimization that enables stochastic and global variance reduction algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Dagr\u00e9ou","year":"2022"},{"key":"ref20","article-title":"Optimal algorithms for stochastic bilevel optimization under relaxed smoothness conditions","author":"Chen","year":"2023"},{"key":"ref21","article-title":"Bilevel optimization under unbounded smoothness: A new algorithm and convergence analysis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hao","year":"2024"},{"key":"ref22","first-page":"17248","article-title":"Bome! bilevel optimization made easy: A simple first-order approach","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Liu","year":"2022"},{"key":"ref23","first-page":"18083","article-title":"A fully first-order method for stochastic bilevel optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kwon","year":"2023"},{"key":"ref24","article-title":"Near-optimal fully first-order algorithms for finding stationary points in bilevel optimization","author":"Chen","year":"2023"},{"key":"ref25","first-page":"30992","article-title":"On penalty-based bilevel gradient descent method","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Shen","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746612"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11590-024-02101-4"},{"key":"ref28","first-page":"238","article-title":"Decentralized gossip-based stochastic bilevel optimization over communication networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Yang","year":"2022"},{"key":"ref29","article-title":"Stochastic bilevel distributed optimization over a network","author":"Gao","year":"2022"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1287\/moor.2024.0488"},{"key":"ref31","article-title":"Decentralized bilevel optimization over graphs: Loopless algorithmic update and transient iteration complexity","author":"Kong","year":"2024"},{"key":"ref32","first-page":"21146","article-title":"FedNest: Federated bilevel, minimax, and compositional optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Davoud Ataee Tarzanagh","year":"2022"},{"key":"ref33","first-page":"14039","article-title":"Achieving linear speedup in non-IID federated bilevel learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Huang","year":"2023"},{"key":"ref34","article-title":"SimFBO: Towards simple, flexible and communication-efficient federated bilevel learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Yang","year":"2024"},{"key":"ref35","first-page":"25464","article-title":"Decentralized training of foundation models in heterogeneous environments","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Yuan","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.14778\/3503585.3503590"},{"key":"ref37","first-page":"4641","article-title":"Decentralized stochastic bilevel optimization with improved per-iteration complexity","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen","year":"2023"},{"key":"ref38","article-title":"How to compute Hessian-vector products?\u201d","author":"Dagr\u00e9ou"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.2307\/2550609"},{"key":"ref40","first-page":"11490","article-title":"Convergence of meta-learning with task-specific adaptation over partial parameters","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Ji","year":"2020"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1994.6.1.147"},{"key":"ref42","article-title":"Generalized inner loop meta-learning","author":"Grefenstette","year":"2019"},{"key":"ref43","article-title":"Torchmeta: A meta-learning library for PyTorch","author":"Deleu","year":"2019"},{"key":"ref44","article-title":"Learn2learn: A library for meta-learning research","author":"S\u00e9bastien","year":"2020"},{"key":"ref45","first-page":"4136","article-title":"On the convergence theory for hessian-free bilevel algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Sow","year":"2022"},{"key":"ref46","first-page":"39491","article-title":"Achieving $\\mathcal{O}\\left(\\epsilon^{-1.5}\\right)$ complexity in Hessian\/Jacobian-free stochastic bilevel optimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Yang","year":"2023"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/s10957-025-02647-y"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2015.7402509"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TSIPN.2016.2524588"},{"key":"ref50","article-title":"Can decentralized algorithms outperform centralized algorithms? A case study for decentralized parallel stochastic gradient descent","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Lian","year":"2017"},{"key":"ref51","first-page":"4848","article-title":"\u201c$D^{2}$: Decentralized training over decentralized data","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tang","year":"2018"},{"key":"ref52","first-page":"2348","article-title":"Decentralized gradient methods: Does topology matter?,","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Neglia","year":"2020"},{"key":"ref53","article-title":"An improved analysis of gradient tracking for decentralized machine learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Koloskova","year":"2021"},{"key":"ref54","first-page":"1849","article-title":"Stochastic distributed optimization under average second-order similarity: Algorithms and analysis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Lin","year":"2023"},{"key":"ref55","article-title":"Communication compression for decentralized training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Tang","year":"2018"},{"key":"ref56","article-title":"Decentralized deep learning with arbitrary communication compression","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Koloskova","year":"2020"},{"key":"ref57","article-title":"Achieving near-optimal convergence for distributed minimax optimization with adaptive stepsizes","author":"Huang","year":"2024"},{"key":"ref58","article-title":"Distributed bilevel optimization via adaptive penalization with time-scale separation","author":"Niu","year":"2024"},{"key":"ref59","first-page":"6083","article-title":"On gradient descent ascent for nonconvex-concave minimax problems","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Lin","year":"2020"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1137\/16M1084316"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TCNS.2017.2698261"},{"key":"ref62","article-title":"Decentralized stochastic gradient tracking for non-convex empirical risk minimization","author":"Zhang","year":"2019"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/DSW.2019.8755807"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-020-01487-0"},{"issue":"306","key":"ref65","first-page":"1","article-title":"Multi-consensus decentralized accelerated gradient descent","volume":"24","author":"Ye","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-023-01997-7"},{"key":"ref67","first-page":"3011","article-title":"Will bilevel optimizers benefit from loops","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ji","year":"2022"},{"key":"ref68","article-title":"The complexity of first-order methods in stochastic bilevel optimization","author":"Kwon","year":"2024"},{"key":"ref69","article-title":"Decentralized stochastic bilevel optimization with improved per-iteration complexity","author":"Chen","year":"2022"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/tsp.2025.3624427"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2021.3083216"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref73","first-page":"2324","article-title":"A one-sample decentralized proximal algorithm for non-convex stochastic composite optimization","volume-title":"Proc. Uncertainty Artif. Intell.","author":"Xiao","year":"2023"},{"key":"ref74","article-title":"Stochastic controlled averaging for federated learning with communication compression","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Huang","year":"2023"}],"container-title":["IEEE Transactions on Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/78\/10807692\/11220911.pdf?arnumber=11220911","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,23]],"date-time":"2025-12-23T18:32:48Z","timestamp":1766514768000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11220911\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/tsp.2025.3624427","relation":{},"ISSN":["1053-587X","1941-0476"],"issn-type":[{"value":"1053-587X","type":"print"},{"value":"1941-0476","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}