{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T16:54:06Z","timestamp":1774630446548,"version":"3.50.1"},"reference-count":120,"publisher":"Zhejiang University Press","issue":"12","license":[{"start":{"date-parts":[[2020,10,15]],"date-time":"2020-10-15T00:00:00Z","timestamp":1602720000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,10,15]],"date-time":"2020-10-15T00:00:00Z","timestamp":1602720000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front Inform Technol Electron Eng"],"published-print":{"date-parts":[[2020,12]]},"DOI":"10.1631\/fitee.1900533","type":"journal-article","created":{"date-parts":[[2020,10,15]],"date-time":"2020-10-15T10:03:39Z","timestamp":1602756219000},"page":"1726-1744","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":243,"title":["Deep reinforcement learning: a survey"],"prefix":"10.1631","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0792-3858","authenticated-orcid":false,"given":"Hao-nan","family":"Wang","sequence":"first","affiliation":[]},{"given":"Ning","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yi-yun","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Da-wei","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Feng","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Dong-sheng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yi-ming","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"635","published-online":{"date-parts":[[2020,10,15]]},"reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1015330.1015430"},{"key":"ref2","first-page":"22","article-title":"Constrained policy optimization","volume-title":"Proc 34th Int Conf on Machine Learning","author":"Achiam","year":"2017"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-19738-4_12"},{"key":"ref4","first-page":"10019","article-title":"Neural voice cloning with a few samples","volume-title":"Proc 32nd Neural Information Processing Systems","author":"Arik","year":"2018"},{"key":"ref5","first-page":"2930","article-title":"Playing hard exploration games by watching YouTube","volume-title":"Proc 32nd Neural Information Processing Systems","author":"Aytar","year":"2018"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3912"},{"key":"ref7","first-page":"1471","article-title":"Unifying count-based exploration and intrinsic motivation","volume-title":"Proc 30th Neural Information Processing Systems","author":"Bellemare","year":"2016"},{"key":"ref8","first-page":"449","article-title":"A distributional perspective on reinforcement learning","volume-title":"Proc 34th Int Conf on Machine Learning","author":"Bellemare","year":"2017"},{"key":"ref9","first-page":"1613","article-title":"Weight uncertainty in neural networks","volume-title":"Proc 32nd Int Conf on Machine Learning","author":"Blundell","year":"2015"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2019.02.006"},{"key":"ref11","first-page":"8224","article-title":"Sample-efficient reinforcement learning with stochastic ensemble value expansion","volume-title":"Proc 32nd Neural Information 
Processing Systems","author":"Buckman","year":"2018"},{"key":"ref12","article-title":"Large-scale study of curiosity-driven learning","author":"Burda","year":"2019"},{"key":"ref13","first-page":"2249","article-title":"An empirical evaluation of Thomp-son sampling","volume-title":"Proc 24th Neural Information Processing Systems","author":"Chapelle","year":"2011"},{"key":"ref14","first-page":"703","article-title":"Combining model-based and model-free updates for trajectory-centric reinforcement learning","volume-title":"Proc 34th Int Conf on Machine Learning","author":"Chebotar","year":"2017"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230551"},{"key":"ref16","article-title":"Sample efficient adaptive text-to-speech","author":"Chen","year":"2019"},{"key":"ref17","first-page":"4754","article-title":"Deep re-inforcement learning in a handful of trials using proba-bilistic dynamics models","volume-title":"Proc 32nd Neural Information Processing Systems","author":"Chua","year":"2018"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2017.7989250"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p17-1045"},{"key":"ref20","volume-title":"RL2: fast reinforcement learning via slow reinforcement learning","author":"Duan","year":"2017"},{"key":"ref21","first-page":"344","article-title":"Self-supervised visual planning with temporal skip connections","volume-title":"Proc 1 st Annual Conf on Robot Learning","author":"Ebert","year":"2017"},{"key":"ref22","article-title":"Model-based value estimation for efficient model-free reinforcement learning","author":"Feinberg","year":"2018"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2017.7989324"},{"key":"ref24","first-page":"49","article-title":"Guided cost learning: deep inverse optimal control via policy optimization","volume-title":"Proc 33rd Int Conf on Machine Learning","author":"Finn","year":"2016a"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2016.7487173"},{"key":"ref26","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume-title":"Proc 34th Int Conf on Machine Learning","author":"Finn","year":"2017a"},{"key":"ref27","first-page":"357","article-title":"One-shot visual imitation learning via meta-learning","volume-title":"Proc 1st Conf on Robot Learning","author":"Finn","year":"2017b"},{"key":"ref28","article-title":"Noisy networks for exploration","author":"Fortunato","year":"2019"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/iros.2016.7759592"},{"key":"ref30","first-page":"2577","article-title":"EX2: exploration with exemplar models for deep reinforcement learning","volume-title":"Proc 30th Neural Information Processing Systems","author":"Fu","year":"2017a"},{"key":"ref31","article-title":"Learning robust rewards with adversarial inverse reinforcement learning","author":"Fu","year":"2017b"},{"key":"ref32","first-page":"1587","article-title":"Addressing function approximation error in actor-critic methods","volume-title":"Proc 35th Int Conf on Machine Learning","author":"Fujimoto","year":"2018"},{"key":"ref33","first-page":"3581","article-title":"Concrete dropout","volume-title":"Proc 30th Neural Information Processing Systems","author":"Gal","year":"2017"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.65109\/ihus5472"},{"key":"ref35","first-page":"7879","article-title":"SMILe: scalable meta inverse reinforcement learning through context-conditional 
policies","volume-title":"Proc 32nd Neural Information Processing Systems","author":"Ghasemipour","year":"2019"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n18-1032"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-1398"},{"key":"ref38","first-page":"2829","article-title":"Continuous deep Q-learning with model-based acceleration","volume-title":"Proc 33rd Int Conf on Machine Learning","author":"Gu","year":"2016"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2017.7989385"},{"key":"ref40","article-title":"Q-Prop: sample-efficient policy gradient with an off-policy critic","author":"Gu","year":"2017b"},{"key":"ref41","first-page":"5302","article-title":"Meta-reinforcement learning of structured exploration strategies","volume-title":"Proc 32nd Neural Information Processing Systems","author":"Gupta","year":"2018"},{"key":"ref42","first-page":"1352","article-title":"Reinforcement learning with deep energy-based policies","volume-title":"Proc 34th Int Conf on Machine Learning","author":"Haarnoja","year":"2017"},{"key":"ref43","first-page":"1861","article-title":"Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor","volume-title":"Proc 35th Int Conf on Machine Learning","author":"Haarnoja","year":"2018"},{"key":"ref44","article-title":"Deep recurrent Q-learning for partially observable MDPs","author":"Hausknecht","year":"2017"},{"key":"ref45","first-page":"820","article-title":"Dual learning for machine translation","volume-title":"Proc 30th Neural Information Processing Systems","author":"He","year":"2016"},{"key":"ref46","article-title":"Emergence of locomotion behaviours in rich environments","author":"Heess","year":"2017"},{"key":"ref47","article-title":"Rain-bow: combining improvements in deep reinforcement learning","author":"Hessel","year":"2018"},{"key":"ref48","first-page":"4565","article-title":"Generative adversarial imitation learning","volume-title":"Proc 30th Neural Information Processing Systems","author":"Ho","year":"2016"},{"key":"ref49","article-title":"Distributed prioritized experience replay","author":"Horgan","year":"2018"},{"key":"ref50","first-page":"1109","article-title":"Variational information maximizing exploration","volume-title":"Proc 30th Neural Information Processing Systems","author":"Houthooft","year":"2017"},{"key":"ref51","first-page":"267","article-title":"Approximately optimal approx-imate reinforcement learning","volume-title":"Proc 19th Int Conf on Machine Learning","author":"Kakade","year":"2002"},{"key":"ref52","first-page":"651","article-title":"QT-Opt: scalable deep reinforcement learning for vision-based robotic manipulation","volume-title":"Proc 2nd Conf on Robot Learning","author":"Kalashnikov","year":"2018"},{"key":"ref53","first-page":"1007","article-title":"Inverse reinforcement learning through structured classification","volume-title":"Proc 25th Neural Information Processing Systems","author":"Klein","year":"2012"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553441"},{"key":"ref55","first-page":"1097","article-title":"ImageNet clas-sification with deep convolutional neural networks","volume-title":"Proc 25th Neural Information Processing Systems","author":"Krizhevsky","year":"2012"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1016\/0921-8890(95)00026-c"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ijcnn.2012.6252823"},{"key":"ref58","first-page":"1","article-title":"Guided policy 
search","volume-title":"Proc 30th Int Conf on Machine Learning","author":"Levine","year":"2013"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2015.7138994"},{"issue":"1","key":"ref60","first-page":"1334","article-title":"End-to-end training of deep visuomotor policies","volume":"17","author":"Levine","year":"2016","journal-title":"J Mach Learn Res"},{"key":"ref61","first-page":"2829","article-title":"Continuous control with deep reinforcement learning","volume-title":"Proc 4th Int Conf on Learning Representations","author":"Lillicrap","year":"2016"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/bf00992699"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3005745.3005750"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342080"},{"key":"ref65","first-page":"2490","article-title":"Park: an open platform for learning-augmented computer systems","volume-title":"Proc 36th Int Conf on Machine Learning","author":"Mao","year":"2019b"},{"key":"ref66","article-title":"A simple neural attentive meta-learner","author":"Mishra","year":"2018"},{"key":"ref67","article-title":"Playing Atari with deep reinforcement learning","author":"Mnih","year":"2013"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1038\/nature14236"},{"key":"ref69","first-page":"1928","article-title":"Asynchronous methods for deep reinforcement learning","volume-title":"Proc 33rd Int Conf on Machine Learning","author":"Mnih","year":"2016"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-56991-8_32"},{"key":"ref71","first-page":"2775","article-title":"Bridging the gap between value and policy based reinforcement learning","volume-title":"Proc 31st Neural Information Processing Systems","author":"Nachum","year":"2017a"},{"key":"ref72","article-title":"Trust-PCL: an off-policy trust region method for continuous control","author":"Nachum","year":"2017b"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2018.8463189"},{"key":"ref74","article-title":"Learning to adapt in dynamic, real-world environments through meta- reinforcement learning","author":"Nagabandi","year":"2019"},{"key":"ref75","first-page":"663","article-title":"Algorithms for inverse reinforcement learning","volume-title":"Proc 17th Int Conf on Machine Learning","author":"Ng","year":"2000"},{"key":"ref76","first-page":"4026","article-title":"Deep ex-ploration via bootstrapped DQN","volume-title":"Proc 29th Neural Information Processing Systems","author":"Osband","year":"2016"},{"key":"ref77","first-page":"2721","article-title":"Count- based exploration with neural density models","volume-title":"Proc 34th Int Conf on Machine Learning","author":"Ostrovski","year":"2017"},{"key":"ref78","article-title":"Actor-Mimic: deep multitask and transfer reinforcement learning","author":"Parisotto","year":"2016"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw.2017.70"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201311"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2018.8460528"},{"key":"ref82","first-page":"214","article-title":"Deep voice 3: 2000-speaker neural text- to-speech","volume-title":"Proc Int Conf on Learning Representations","author":"Ping","year":"2018"},{"key":"ref83","article-title":"Observe and look further: achieving consistent performance on Atari","author":"Pohlen","year":"2018"},{"key":"ref84","first-page":"5694","article-title":"Imagination-augmented agents for deep reinforcement 
learning","volume-title":"Proc 31st Neural Information Processing Systems","author":"Racanicre","year":"2017"},{"key":"ref85","article-title":"Learning real manipulation tasks from virtual demon-strations using LSTM","author":"Rahmatizadeh","year":"2016"},{"key":"ref86","article-title":"EPOpt: learning robust neural network policies using model en-sembles","author":"Rajeswaran","year":"2017"},{"key":"ref87","first-page":"5331","article-title":"Efficient off-policy meta-reinforcement learning via probabilistic context variables","volume-title":"Proc 36th Int Conf on Machine Learning","author":"Rakelly","year":"2019"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143936"},{"key":"ref89","first-page":"1583","article-title":"Learning to optimize via information-directed sampling","volume-title":"Proc 27th Neural Information Processing Systems","author":"Russo","year":"2014"},{"key":"ref90","article-title":"Policy distillation","author":"Rusu","year":"2016a"},{"key":"ref91","article-title":"Pro-gressive neural networks","author":"Rusu","year":"2016b"},{"key":"ref92","article-title":"Prioritized experience replay","author":"Schaul","year":"2016"},{"key":"ref93","first-page":"1889","article-title":"Trust re-gion policy optimization","volume-title":"Proc Int Conf on Machine Learning","author":"Schulman","year":"2015"},{"key":"ref94","article-title":"High-dimensional continuous control using generalized ad-vantage estimation","author":"Schulman","year":"2016"},{"key":"ref95","article-title":"Proximal policy optimization algorithms","author":"Schulman","year":"2017"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1631\/fitee.1700826"},{"key":"ref97","first-page":"387","article-title":"Deterministic policy gradient algorithms","volume-title":"Proc 31st Int Conf on Machine Learning","author":"Silver","year":"2014"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1038\/nature16961"},{"key":"ref99","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron","volume-title":"Int Conf on Machine Learning","author":"Skerry-Ryan","year":"2018"},{"key":"ref100","article-title":"Some considerations on learning to explore via meta-reinforcement learning","author":"Stadie","year":"2018"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcss.2007.08.009"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1023\/a:1022633531479"},{"key":"ref103","volume-title":"Reinforcement Learning: an Introduction","author":"Sutton","year":"2018"},{"key":"ref104","first-page":"2753","article-title":"#Exploration: a study of count-based exploration for deep reinforcement learning","volume-title":"Proc 31 st Neural Information Processing Systems","author":"Tang","year":"2017"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10295"},{"key":"ref106","article-title":"Meta-learning: a survey","author":"Vanschoren","year":"2018"},{"key":"ref107","article-title":"StarCraft II: a new challenge for reinforcement learning","author":"Vinyals","year":"2017"},{"key":"ref108","article-title":"Learning to reinforcement learn","author":"Wang","year":"2017"},{"key":"ref109","first-page":"1995","article-title":"Dueling network architectures for deep reinforcement learning","volume-title":"Proc 33rd Int Conf on Machine Learning","author":"Wang","year":"2016"},{"key":"ref110","article-title":"Sample efficient actor-critic with experience 
replay","author":"Wang","year":"2017"},{"key":"ref111","first-page":"2746","article-title":"Em-bed to control: a locally linear latent dynamics model for control from raw images","volume-title":"Proc 28th Neural Information Processing Systems","author":"Watter","year":"2015"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1023\/a:1022672621406"},{"key":"ref113","first-page":"5279","article-title":"Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation","volume-title":"Proc 30th Neural Information Processing Systems","author":"Wu","year":"2017"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1016\/j.robot.2016.06.003"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/iros.2017.8202141"},{"key":"ref116","doi-asserted-by":"crossref","DOI":"10.15607\/RSS.2018.XIV.002","article-title":"One-shot imitation from observing humans via domain-adaptive meta-learning","author":"Yu","year":"2018"},{"key":"ref117","doi-asserted-by":"crossref","DOI":"10.15607\/RSS.2017.XIII.048","article-title":"Preparing for the unknown: learning a universal policy with online system identification","author":"Yu","year":"2017"},{"key":"ref118","first-page":"7444","article-title":"SOLAR: deep structured representations for model-based rein-forcement learning","volume-title":"Proc 36th Int Conf on Machine Learning","author":"Zhang","year":"2019"},{"key":"ref119","first-page":"1433","article-title":"Maximum entropy inverse reinforcement learning","volume-title":"Proc 23rd AAAI Conf on Artificial Intelligence","author":"Ziebart","year":"2008"},{"key":"ref120","first-page":"7693","article-title":"Fast context adaptation via meta-learning","volume-title":"Proc 36th Int Conf on Machine Learning","author":"Zintgraf","year":"2019"}],"container-title":["Frontiers of Information Technology &amp; Electronic Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1631\/FITEE.1900533.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1631\/FITEE.1900533\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1631\/FITEE.1900533.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T06:59:35Z","timestamp":1771657175000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1631\/FITEE.1900533"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,15]]},"references-count":120,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2020,12]]}},"alternative-id":["1584"],"URL":"https:\/\/doi.org\/10.1631\/fitee.1900533","relation":{},"ISSN":["2095-9184","2095-9230"],"issn-type":[{"value":"2095-9184","type":"print"},{"value":"2095-9230","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,10,15]]},"assertion":[{"value":"29 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 March 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 October 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}