{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T05:54:29Z","timestamp":1778824469826,"version":"3.51.4"},"reference-count":187,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72331009"],"award-info":[{"award-number":["72331009"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72171206"],"award-info":[{"award-number":["72171206"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92270105"],"award-info":[{"award-number":["92270105"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012302","name":"Guangdong Power Grid Company","doi-asserted-by":"publisher","award":["GDKJXM20231024"],"award-info":[{"award-number":["GDKJXM20231024"]}],"id":[{"id":"10.13039\/501100012302","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Key Lab of Crowd Intelligence Empowered Low-Carbon Energy Network","award":["ZDSYS20220606100601002"],"award-info":[{"award-number":["ZDSYS20220606100601002"]}]},{"name":"Shenzhen Natural Science Fund","award":["GXWD20231128112434001"],"award-info":[{"award-number":["GXWD20231128112434001"]}]},{"name":"Shenzhen Institute of Artificial Intelligence and Robotics for Society"},{"name":"PolyU Direct Grant","award":["P0047700"],"award-info":[{"award-number":["P0047700"]}]},{"name":"PolyU Direct Grant","award":["P0043885"],"award-info":[{"award-number":["P0043885"]}]},{"name":"PolyU Direct Grant","award":["P0051105"],"award-info":[{"award-number":["P0051105"]}]},{"name":"CUHK Strategic Partnership Award for Research Collaboration","award":["4750467"],"award-info":[{"award-number":["4750467"]}]},{"name":"CUHK Direct Grant for Research","award":["4055228"],"award-info":[{"award-number":["4055228"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1109\/tnnls.2024.3497992","type":"journal-article","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T18:48:00Z","timestamp":1732560480000},"page":"9737-9757","source":"Crossref","is-referenced-by-count":100,"title":["Survey on Large Language Model-Enhanced Reinforcement Learning: Concept, Taxonomy, and Methods"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8664-0823","authenticated-orcid":false,"given":"Yuji","family":"Cao","sequence":"first","affiliation":[{"name":"Department of Mechanical and Automation Engineering, The Chinese University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3133-3137","authenticated-orcid":false,"given":"Huan","family":"Zhao","sequence":"additional","affiliation":[{"name":"Department of Building Environment and Energy Engineering, The Hong Kong Polytechnic University, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9268-2067","authenticated-orcid":false,"given":"Yuheng","family":"Cheng","sequence":"additional","affiliation":[{"name":"School of Science and Engineering, The Chinese University of Hong Kong, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8630-7868","authenticated-orcid":false,"given":"Ting","family":"Shu","sequence":"additional","affiliation":[{"name":"National Engineering Laboratory for Big Data System Computing Technology, Shenzhen University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7594-7587","authenticated-orcid":false,"given":"Yue","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Mechanical and Automation Engineering, The Chinese University of Hong Kong, Hong Kong, SAR, China"}]},{"given":"Guolong","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Electrical and Electronic Engineering, Nanyang Technological University, Jurong West, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9060-1675","authenticated-orcid":false,"given":"Gaoqi","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering and Automation, Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5446-2655","authenticated-orcid":false,"given":"Junhua","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Science and Engineering, The Chinese University of Hong Kong, Shenzhen, China"}]},{"given":"Jinyue","family":"Yan","sequence":"additional","affiliation":[{"name":"Shenzhen Institute for Advanced Study, University of Electronic Science and Technology of China, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6575-1839","authenticated-orcid":false,"given":"Yun","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen Institute for Advanced Study, University of Electronic Science and Technology of China, Shenzhen, China"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Reinforcement Learning: An Introduction","author":"Sutton","year":"2018"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref3","article-title":"Proximal policy optimization algorithms","author":"Schulman","year":"2017","journal-title":"arXiv:1707.06347"},{"key":"ref4","first-page":"1861","article-title":"Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Haarnoja"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-019-1724-z"},{"key":"ref6","article-title":"Dota 2 with large scale deep reinforcement learning","author":"Berner","year":"2019","journal-title":"arXiv:1912.06680"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.13140\/RG.2.2.18893.74727"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-03051-4"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1049\/enc2.12075"},{"issue":"6556","key":"ref10","first-page":"664","article-title":"Rebel: A general game playing AI","volume":"373","author":"Schmid","year":"2021","journal-title":"Science"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1126\/science.aay2400"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref13","first-page":"1","article-title":"Imagenet classification with deep convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"25","author":"Krizhevsky"},{"key":"ref14","first-page":"1","article-title":"Language as an abstraction for hierarchical deep reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Jiang"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref16","first-page":"9870","article-title":"Decoupling representation learning from reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Stooke"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/880"},{"key":"ref18","article-title":"What matters in learning from offline human demonstrations for robot manipulation","author":"Mandlekar","year":"2021","journal-title":"arXiv:2108.03298"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2023.3295255"},{"key":"ref20","first-page":"3659","article-title":"Towards applicable reinforcement learning: Improving the generalization and sample efficiency with policy ensemble","volume-title":"Proc. Int. Joint Conf. Artif. Intell.","author":"Yang"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2022.103829"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.aei.2022.101612"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i5.25733"},{"key":"ref24","first-page":"12004","article-title":"Goal misgeneralization in deep reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Di Langosco"},{"key":"ref25","first-page":"1","article-title":"A generalized algorithm for multi-objective reinforcement learning and policy adaptation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Yang"},{"key":"ref26","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref27","volume-title":"Improving Language Understanding By Generative Pre-Training","author":"Radford","year":"2018"},{"key":"ref28","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NeurIPS","volume":"33","author":"Brown"},{"issue":"240","key":"ref29","first-page":"1","article-title":"PaLM: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-023-02448-8"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06792-0"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.apenergy.2024.123431"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.apenergy.2024.123554"},{"key":"ref34","article-title":"ElecBench: A power dispatch evaluation benchmark for large language models","author":"Zhou","year":"2024","journal-title":"arXiv:2407.05365"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1038\/s41562-023-01659-w"},{"key":"ref37","article-title":"Larger language models do in-context learning differently","author":"Wei","year":"2023","journal-title":"arXiv:2303.03846"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.67"},{"key":"ref39","article-title":"KoLA: Carefully benchmarking world knowledge of large language models","author":"Yu","year":"2023","journal-title":"arXiv:2306.09296"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10135-3"},{"key":"ref41","article-title":"Learning to summarize from human feedback","author":"Stiennon","year":"2022","journal-title":"arXiv:2009.01325"},{"key":"ref42","article-title":"What learning algorithm is in-context learning? Investigations with linear models","author":"Aky\u00fcrek","year":"2023","journal-title":"arXiv:2211.15661"},{"key":"ref43","article-title":"Guiding pretraining in reinforcement learning with large language models","author":"Du","year":"2023","journal-title":"arXiv:2302.06692"},{"key":"ref44","article-title":"Grounding large language models in interactive environments with online reinforcement learning","author":"Carta","year":"2023","journal-title":"arXiv:2302.02662"},{"key":"ref45","article-title":"Learning to model the world with language","author":"Lin","year":"2023","journal-title":"arXiv:2308.01399"},{"key":"ref46","article-title":"Auto MC-reward: Automated dense reward design with large language models for minecraft","author":"Li","year":"2023","journal-title":"arXiv:2312.09238"},{"key":"ref47","article-title":"RE-MOVE: An adaptive policy design for robotic navigation tasks in dynamic environments via language-based feedback","author":"Chakraborty","year":"2023","journal-title":"arXiv:2303.07622"},{"key":"ref48","article-title":"Natural language-conditioned reinforcement learning with inside-out task language development and translation","author":"Pang","year":"2023","journal-title":"arXiv:2302.09368"},{"key":"ref49","article-title":"QT-opt: Scalable deep reinforcement learning for vision-based robotic manipulation","author":"Kalashnikov","year":"2018","journal-title":"arXiv:1806.10293"},{"key":"ref50","first-page":"7968","article-title":"Improving generalization in reinforcement learning with mixture regularization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Wang"},{"key":"ref51","first-page":"5829","article-title":"Exploration-guided reward shaping for reinforcement learning under sparse rewards","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Devidze"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2019.XV.073"},{"key":"ref53","first-page":"1","article-title":"Inverse reward design","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Hadfield-Menell"},{"key":"ref54","article-title":"Learning to combat compounding-error in model-based reinforcement learning","author":"Xiao","year":"2019","journal-title":"arXiv:1912.11206"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1561\/2200000086"},{"key":"ref56","first-page":"1","article-title":"Hard tasks first: Multi-task reinforcement learning through task scheduling","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Cho"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/tcds.2025.3543694"},{"key":"ref58","first-page":"21495","article-title":"PaCo: Parameter-compositional multi-task reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Sun"},{"key":"ref59","first-page":"1","article-title":"Efficient multi-task reinforcement learning via selective behavior sharing","volume-title":"Proc. ICLR","author":"Zhang"},{"issue":"9","key":"ref60","doi-asserted-by":"crossref","first-page":"1363","DOI":"10.3390\/electronics9091363","article-title":"A survey of multi-task deep reinforcement learning","volume":"9","author":"Varghese","year":"2020","journal-title":"Electronics"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.029"},{"key":"ref62","article-title":"Plan4mc: Skill reinforcement learning and planning for open-world minecraft tasks","author":"Yuan","year":"2023","journal-title":"arXiv:2303.16563"},{"key":"ref63","article-title":"Vision-language models are zero-shot reward models for reinforcement learning","author":"Rocamonde","year":"2023","journal-title":"arXiv:2310.12921"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3624724"},{"key":"ref65","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref66","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv:2001.08361"},{"key":"ref67","article-title":"Training compute-optimal large language models","author":"Hoffmann","year":"2022","journal-title":"arXiv:2203.15556"},{"key":"ref68","article-title":"Emergent abilities of large language models","volume":"2022","author":"Wei","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref69","article-title":"Multitask prompted training enables zero-shot task generalization","author":"Sanh","year":"2021","journal-title":"arXiv:2110.08207"},{"key":"ref70","article-title":"Training language models to follow instructions with human feedback","author":"Ouyang","year":"2022","journal-title":"arXiv:2203.02155"},{"key":"ref71","article-title":"GAIA\u2014A large language model for advanced power dispatch","author":"Cheng","year":"2024","journal-title":"arXiv:2408.03847"},{"key":"ref72","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2023","journal-title":"arXiv:2201.11903"},{"key":"ref73","article-title":"Tree of thoughts: Deliberate problem solving with large language models","author":"Yao","year":"2023","journal-title":"arXiv:2305.10601"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29720"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3057424"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3109284"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2975035"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2023.3326551"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3227717"},{"key":"ref80","article-title":"Eureka: human-level reward design via coding large language models","author":"Ma","year":"2023","journal-title":"arXiv:2310.12931"},{"key":"ref81","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv:2204.01691"},{"key":"ref82","article-title":"Do embodied agents dream of pixelated sheep: Embodied decision making using language guided world modelling","author":"Nottingham","year":"2023","journal-title":"arXiv:2301.12050"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctt4cgngj.10"},{"key":"ref84","article-title":"Data-efficient reinforcement learning with self-predictive representations","author":"Schwarzer","year":"2020","journal-title":"arXiv:2007.05929"},{"key":"ref85","article-title":"History compression via language models in reinforcement learning","author":"Paischer","year":"2023","journal-title":"arXiv:2205.12258"},{"key":"ref86","first-page":"1","article-title":"Semantic helm: A human-readable memory for reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Paischer"},{"key":"ref87","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. ICML","author":"Radford"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref89","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref90","first-page":"1","article-title":"Efficient policy adaptation with contrastive prompt ensemble for embodied agents","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Kim"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02161"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.935"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2020.XVI.016"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i7.16749"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00280"},{"key":"ref97","first-page":"1","article-title":"Informing reinforcement learning agents by grounding language to Markov decision processes","volume-title":"Proc. Workshop Training Agents Found. Models RLC","author":"Spiegel"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_18"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127181"},{"key":"ref100","article-title":"Toward robust multimodal learning using multimodal foundational models","author":"Zhao","year":"2024","journal-title":"arXiv:2401.13697"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/UR61395.2024.10597462"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-41188-6_3"},{"key":"ref103","first-page":"166","article-title":"Modular multitask reinforcement learning with policy sketches","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Andreas"},{"key":"ref104","first-page":"29529","article-title":"ELLA: Exploration through learned language abstraction","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Mirchandani"},{"key":"ref105","first-page":"1","article-title":"Reward design with language models","volume-title":"Proc. 11th Int. Conf. Learning Represent.","author":"Kwon"},{"key":"ref106","article-title":"Read and reap the rewards: Learning to play atari with the help of instruction manuals","author":"Wu","year":"2023","journal-title":"arXiv:2302.04449"},{"key":"ref107","article-title":"Accelerating reinforcement learning of robotic manipulations via feedback from large language models","author":"Chu","year":"2023","journal-title":"arXiv:2311.02379"},{"key":"ref108","article-title":"Language reward modulation for pretraining reinforcement learning","author":"Adeniji","year":"2023","journal-title":"arXiv:2308.12270"},{"key":"ref109","article-title":"DistilBERT, a distilled version of BERT: Smaller, faster, cheaper and lighter","author":"Sanh","year":"2019","journal-title":"arXiv:1910.01108"},{"key":"ref110","article-title":"Guide your agent with adaptive multimodal rewards","author":"Kim","year":"2023","journal-title":"arXiv:2309.10790"},{"key":"ref111","article-title":"R3M: A universal visual representation for robot manipulation","author":"Nair","year":"2022","journal-title":"arXiv:2203.12601"},{"key":"ref112","article-title":"Masked world models for visual control","author":"Seo","year":"2023","journal-title":"arXiv:2206.14244"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref114","article-title":"RL-VLM-F: Reinforcement learning from vision language foundation model feedback","author":"Wang","year":"2024","journal-title":"arXiv:2402.03681"},{"key":"ref115","article-title":"Language to rewards for robotic skill synthesis","author":"Yu","year":"2023","journal-title":"arXiv:2306.08647"},{"key":"ref116","article-title":"Self-refine: Iterative refinement with self-feedback","author":"Madaan","year":"2023","journal-title":"arXiv:2303.17651"},{"key":"ref117","article-title":"Self-refined large language model as automated reward function designer for deep reinforcement learning in robotics","author":"Song","year":"2023","journal-title":"arXiv:2309.06687"},{"key":"ref118","first-page":"1","article-title":"Text2reward: Reward shaping with language models for reinforcement learning","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Xie"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4614228"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1162\/coli_a_00524"},{"key":"ref121","first-page":"10835","article-title":"Scaling laws for reward model overoptimization","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","volume":"202","author":"Gao"},{"key":"ref122","article-title":"REBEL: A regularization-based solution for reward overoptimization in robotic reinforcement learning from human feedback","author":"Chakraborty","year":"2023","journal-title":"arXiv:2312.14436"},{"key":"ref123","article-title":"Addressing bias through ensemble learning and regularized fine-tuning","author":"Radwan","year":"2024","journal-title":"arXiv:2402.00910"},{"key":"ref124","article-title":"Prompter: Utilizing large language model prompting for a data efficient embodied instruction following","author":"Inoue","year":"2022","journal-title":"arXiv:2211.03267"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_16"},{"key":"ref126","first-page":"1273","article-title":"Offline reinforcement learning as one big sequence modeling problem","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Janner"},{"key":"ref127","first-page":"31199","article-title":"Pre-trained language models for interactive decision-making","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Li"},{"key":"ref128","article-title":"Unleashing the power of pre-trained language models for offline reinforcement learning","author":"Shi","year":"2023","journal-title":"arXiv:2310.20587"},{"key":"ref129","first-page":"1","article-title":"Goal-conditioned predictive coding for offline reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zeng"},{"key":"ref130","article-title":"Can Wikipedia help offline reinforcement learning?","author":"Reid","year":"2022","journal-title":"arXiv:2201.12122"},{"key":"ref131","article-title":"D4RL: Datasets for deep data-driven reinforcement learning","author":"Fu","year":"2020","journal-title":"arXiv:2004.07219"},{"key":"ref132","article-title":"Think before you act: Unified policy for interleaving language reasoning with actions","author":"Mezghani","year":"2023","journal-title":"arXiv:2304.11063"},{"key":"ref133","first-page":"2165","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","volume-title":"Proc. Conf. Robot Learn.","author":"Zitkovich"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.704"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6297"},{"key":"ref136","article-title":"Language instructed reinforcement learning for human-AI coordination","author":"Hu","year":"2023","journal-title":"arXiv:2304.07297"},{"key":"ref137","article-title":"Large language model as a policy teacher for training reinforcement learning agents","author":"Zhou","year":"2023","journal-title":"arXiv:2311.13373"},{"key":"ref138","first-page":"1","article-title":"Plan-seq-learn: Language model guided RL for solving long horizon robotics tasks","volume-title":"Proc. 12th Int. Conf. Learn. Represent.","author":"Dalal"},{"issue":"1","key":"ref139","first-page":"1437","article-title":"A comprehensive survey on safe reinforcement learning","volume":"16","author":"Garc\u0131a","year":"2015","journal-title":"J. Mach. Learn. Res."},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20855"},{"key":"ref141","article-title":"COptiDICE: Offline constrained reinforcement learning via stationary distribution correction estimation","author":"Lee","year":"2022","journal-title":"arXiv:2204.08957"},{"key":"ref142","first-page":"21611","article-title":"Constrained decision transformer for offline safe reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Liu"},{"key":"ref143","article-title":"Testing language model agents safely in the wild","author":"Naihin","year":"2023","journal-title":"arXiv:2311.10538"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2024.3420959"},{"key":"ref145","article-title":"Reflexion: Language agents with verbal reinforcement learning","author":"Shinn","year":"2023","journal-title":"arXiv:2303.11366"},{"key":"ref146","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv:2106.09685"},{"key":"ref147","article-title":"Dream to control: Learning behaviors by latent imagination","author":"Hafner","year":"2020","journal-title":"arXiv:1912.01603"},{"key":"ref148","article-title":"Mastering atari with discrete world models","author":"Hafner","year":"2022","journal-title":"arXiv:2010.02193"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2022.03.037"},{"key":"ref150","first-page":"15084","article-title":"Decision transformer: Reinforcement learning via sequence modeling","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Chen"},{"key":"ref151","first-page":"1","article-title":"Transformers are sample-efficient world models","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Micheli"},{"key":"ref152","article-title":"Transformer-based world models are happy with 100k interactions","author":"Robine","year":"2023","journal-title":"arXiv:2303.07109"},{"key":"ref153","article-title":"TransDreamer: Reinforcement learning with transformer world models","author":"Chen","year":"2022","journal-title":"arXiv:2202.09481"},{"key":"ref154","article-title":"Reinforcement learning with action-free pre-training from videos","author":"Seo","year":"2022","journal-title":"arXiv:2203.13880"},{"key":"ref155","first-page":"2555","article-title":"Learning latent dynamics for planning from pixels","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Hafner"},{"key":"ref156","article-title":"LanGWM: Language grounded world model","author":"Poudel","year":"2023","journal-title":"arXiv:2311.17593"},{"key":"ref157","article-title":"A survey of explainable reinforcement learning","author":"Milani","year":"2022","journal-title":"arXiv:2202.08434"},{"key":"ref158","first-page":"1","article-title":"State2explanation: Concept-based explanations to benefit agent learning and user understanding","volume-title":"Proc. 37th Conf. Neural Inf. Process. Syst.","author":"Das"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/ICDL55364.2023.10364407"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1145\/3477600"},{"key":"ref161","article-title":"LLM-coordination: Evaluating and analyzing multi-agent coordination abilities in large language models","author":"Agashe","year":"2023","journal-title":"arXiv:2310.03903"},{"key":"ref162","article-title":"SMART-LLM: Smart multi-agent robot task planning using large language models","author":"Kannan","year":"2023","journal-title":"arXiv:2309.10062"},{"key":"ref163","first-page":"1","article-title":"Leveraging large language models for optimised coordination in textual multi-agent reinforcement learning","volume-title":"Proc. ICLR","author":"Slumbers"},{"key":"ref164","article-title":"LLM-based multi-agent reinforcement learning: Current and future directions","author":"Sun","year":"2024","journal-title":"arXiv:2405.11106"},{"key":"ref165","first-page":"1","article-title":"Supervised pretraining can learn in-context reinforcement learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Lee"},{"key":"ref166","article-title":"Towards optimizing human-centric objectives in AI-assisted decision-making with offline reinforcement learning","author":"Bu\u00e7inca","year":"2024","journal-title":"arXiv:2403.05911"},{"key":"ref167","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","author":"Lewis","year":"2020","journal-title":"arXiv:2005.11401"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3367329"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40231-1"},{"key":"ref170","article-title":"Exploring large language model based intelligent agents: Definitions, methods, and prospects","author":"Cheng","year":"2024","journal-title":"arXiv:2401.03428"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/TCIAIG.2012.2186810"},{"key":"ref172","article-title":"A survey on the memory mechanism of large language model based agents","author":"Zhang","year":"2024","journal-title":"arXiv:2404.13501"},{"key":"ref173","article-title":"Gorilla: Large language model connected with massive Apis","author":"Patil","year":"2023","journal-title":"arXiv:2305.15334"},{"key":"ref174","article-title":"WebGPT: Browser-assisted question-answering with human feedback","author":"Nakano","year":"2021","journal-title":"arXiv:2112.09332"},{"key":"ref175","article-title":"Multi-agent collaboration: Harnessing the power of intelligent LLM agents","author":"Talebirad","year":"2023","journal-title":"arXiv:2306.03314"},{"key":"ref176","article-title":"Generating with confidence: Uncertainty quantification for black-box large language models","author":"Lin","year":"2023","journal-title":"arXiv:2305.19187"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1093\/jamia\/ocad133"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01055"},{"key":"ref179","article-title":"Efficient prompting via dynamic in-context learning","author":"Zhou","year":"2023","journal-title":"arXiv:2305.11170"},{"key":"ref180","article-title":"Parameter-efficient mixture-of-experts architecture for pre-trained language models","author":"Gao","year":"2022","journal-title":"arXiv:2203.01104"},{"key":"ref181","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2024","journal-title":"arXiv:2312.00752"},{"key":"ref182","article-title":"SkipDecode: Autoregressive skip decoding with batching and caching for efficient LLM inference","author":"Del Corro","year":"2023","journal-title":"arXiv:2307.02628"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72764-1_2"},{"key":"ref184","article-title":"Fine-tuning large language models with user-level differential privacy","author":"Charles","year":"2024","journal-title":"arXiv:2407.07737"},{"key":"ref185","article-title":"XAI meets LLMs: A survey of the relation between explainable AI and large language models","author":"Cambria","year":"2024","journal-title":"arXiv:2407.15248"},{"key":"ref186","article-title":"Efficient adversarial training in LLMs with continuous attacks","author":"Xhonneux","year":"2024","journal-title":"arXiv:2405.15589"},{"key":"ref187","article-title":"Deconstructing the ethics of large language models from long-standing issues to new-emerging dilemmas: A survey","author":"Deng","year":"2024","journal-title":"arXiv:2406.05392"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/11022714\/10766898.pdf?arnumber=10766898","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,4]],"date-time":"2025-06-04T17:57:39Z","timestamp":1749059859000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10766898\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":187,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2024.3497992","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6]]}}}