{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T16:13:12Z","timestamp":1774627992647,"version":"3.50.1"},"reference-count":211,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"MoDeaAS","award":["PID2019-104818RB-I00"],"award-info":[{"award-number":["PID2019-104818RB-I00"]}]},{"name":"Spanish national grants for PhD studies","award":["FPU17\/00166"],"award-info":[{"award-number":["FPU17\/00166"]}]},{"name":"Spanish national grants for PhD studies","award":["ACIF\/2018\/197"],"award-info":[{"award-number":["ACIF\/2018\/197"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2022,6,1]]},"DOI":"10.1109\/tpami.2020.3045007","type":"journal-article","created":{"date-parts":[[2020,12,15]],"date-time":"2020-12-15T21:40:58Z","timestamp":1608068458000},"page":"2806-2826","source":"Crossref","is-referenced-by-count":203,"title":["A Review on Deep Learning Techniques for Video Prediction"],"prefix":"10.1109","volume":"44","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9875-7110","authenticated-orcid":false,"given":"Sergiu","family":"Oprea","sequence":"first","affiliation":[{"name":"Department of Computer Technology, University of Alicante, Alicante, Spain"}]},{"given":"Pablo","family":"Martinez-Gonzalez","sequence":"additional","affiliation":[{"name":"Department of Computer Technology, University of Alicante, Alicante, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9575-6403","authenticated-orcid":false,"given":"Alberto","family":"Garcia-Garcia","sequence":"additional","affiliation":[{"name":"Campus UAB, Institute of Space Sciences (ICE-CSIC), Barcelona, Spain"}]},{"given":"John Alejandro","family":"Castro-Vargas","sequence":"additional","affiliation":[{"name":"Department of Computer Technology, University of Alicante, Alicante, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6817-6326","authenticated-orcid":false,"given":"Sergio","family":"Orts-Escolano","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Artificial Intelligence, University of Alicante, Alicante, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7798-3055","authenticated-orcid":false,"given":"Jose","family":"Garcia-Rodriguez","sequence":"additional","affiliation":[{"name":"Department of Computer Technology, University of Alicante, Alicante, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8230-3192","authenticated-orcid":false,"given":"Antonis","family":"Argyros","sequence":"additional","affiliation":[{"name":"FORTH, Institute of Computer Science, Heraklion, Greece"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01594-9"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2018.05.018"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.3390\/app10217524"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref5","first-page":"64","article-title":"Unsupervised learning for physical interaction through video prediction","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Finn"},{"key":"ref6","article-title":"Visual foresight: Model-based deep reinforcement learning for vision-based robotic control","author":"Ebert","year":"2018"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2430335"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2019.xv.001"},{"key":"ref9","first-page":"2555","article-title":"Learning latent dynamics for planning from pixels","volume-title":"Proc. 36th Int. Conf. Mach. Learn.","author":"Hafner"},{"key":"ref10","article-title":"Model based reinforcement learning for atari","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kaiser"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00441"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00031"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_45"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00566"},{"key":"ref15","first-page":"89","article-title":"Action anticipation by predicting future dynamic images","volume-title":"Proc. Eur. Conf. Comput. Vis. Workshops","author":"Opazo"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_36"},{"key":"ref17","article-title":"Bayesian prediction of future street scenes using synthetic likelihoods","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Bhattacharyya"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00186"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00684"},{"key":"ref20","first-page":"802","article-title":"Convolutional LSTM network: A machine learning approach for precipitation nowcasting","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Shi"},{"key":"ref21","first-page":"5617","article-title":"Deep learning for precipitation nowcasting: A benchmark and a new model","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Shi"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.478"},{"key":"ref23","first-page":"809","article-title":"Unsupervised pixel-prediction","volume-title":"Proc. 8th Int. Conf. Neural Inf. Process. Syst.","author":"Softky"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1038\/4580"},{"issue":"2","key":"ref25","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1023\/A:1012423722458","article-title":"Predictive coding in the visual cortex by a recurrent network with gabor receptive fields","volume":"14","author":"Deco","year":"2001","journal-title":"Neural Process. Lett."},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1037\/0096-1523.30.3.519"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1037\/0096-3445.120.3.235"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/4528.001.0001"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.visres.2013.10.017"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2012.00548"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.50"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.320"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.13"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00769"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.262"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00840"},{"key":"ref38","article-title":"Stochastic variational video prediction","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Babaeizadeh"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.3390\/make2020006"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TCI.2016.2644865"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.4467\/20838476SI.16.004.6185"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.694"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00418"},{"key":"ref44","article-title":"Deep multi-scale video prediction beyond mean square error","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Mathieu"},{"key":"ref45","first-page":"658","article-title":"Generating images with perceptual similarity metrics based on deep networks","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Dosovitskiy"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.19"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.481"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_36"},{"key":"ref50","article-title":"Unsupervised learning of visual structure using predictive generative networks","author":"Lotter","year":"2015"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123349"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.595"},{"key":"ref53","first-page":"6033","article-title":"Hierarchical long-term video prediction without supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wichers"},{"key":"ref54","first-page":"3560","article-title":"Learning to generate long-term future via hierarchical prediction","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Villegas"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.361"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.194"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.77"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00101"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70711"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref62","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.280"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.248"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"ref66","article-title":"Spatio-temporal video autoencoder with differentiable memory","volume-title":"Proc. Int. Conf. Learn. Representations Workshop","author":"Patraucean"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206631"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref70","article-title":"Deep predictive coding networks for video prediction and unsupervised learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lotter"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00141"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref75","article-title":"Youtube-8m: A large-scale video classification benchmark","author":"Abu-El-Haija","year":"2016"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"ref77","first-page":"1601","article-title":"The recurrent temporal restricted boltzmann machine","volume-title":"Proc. Advances Neural Inf. Process. Syst.","author":"Sutskever"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00247"},{"key":"ref79","first-page":"100","article-title":"Learning invariant features by harnessing the aperture problem","volume-title":"Proc. 30th Int. Conf. Mach. Learn.","author":"Memisevic"},{"key":"ref80","first-page":"843","article-title":"Unsupervised learning of video representations using LSTMs","volume-title":"Proc. 32nd Int. Conf. Mach. Learn.","author":"Srivastava"},{"key":"ref81","first-page":"344","article-title":"Self-supervised visual planning with temporal skip connections","volume":"78","author":"Ebert","year":"2017","journal-title":"Proc. Conf. Robot Learn."},{"key":"ref82","article-title":"Robonet: Large-scale multi-robot learning","author":"Dasari","year":"2019"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-009-0402-9"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540145"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3912"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.400"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8594495"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-019-0168-5"},{"key":"ref89","first-page":"92","article-title":"Unsupervised learning of object structure and dynamics from videos","volume-title":"Proc. Advances Neural Inf. Process. Syst.","author":"Minderer"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.18"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_30"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_51"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3126686.3126737"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00819"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01045"},{"key":"ref96","article-title":"Dream to control: Learning behaviors by latent imagination","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hafner"},{"key":"ref97","article-title":"Video (language) modeling: a baseline for generative models of natural videos","author":"Ranzato","year":"2014"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2010-343"},{"key":"ref99","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Simonyan"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24673-2_3"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_16"},{"key":"ref102","article-title":"Progressive growing of GANs for improved quality, stability, and variation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Karras"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.5194\/isprs-archives-xlii-2-w16-3-2019"},{"key":"ref104","first-page":"5769","article-title":"Improved training of wasserstein GANs","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Gulrajani"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00048"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00461"},{"key":"ref107","article-title":"Photo-realistic video prediction on natural videos of largely changing frames","author":"Shouno","year":"2020"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00191"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756585"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_44"},{"key":"ref111","article-title":"Eidetic 3D LSTM: A model for video prediction and beyond","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Wang"},{"key":"ref112","article-title":"Efficient and information-preserving future frame prediction and beyond","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Yu"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_46"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_44"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1162\/neco.2010.01-09-953"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126419"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.53"},{"key":"ref118","first-page":"1925","article-title":"Modeling deep temporal dependencies with recurrent grammar cells","volume-title":"Proc. 27th Int. Conf. Neural Inf. Process. Syst.","author":"Michalski"},{"key":"ref119","first-page":"2017","article-title":"Spatial transformer networks","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Jaderberg"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.230"},{"key":"ref121","article-title":"Transformation-based models of video sequences","author":"van Amersfoort","year":"2017"},{"key":"ref122","first-page":"2863","article-title":"Action-conditional video prediction using deep networks in atari games","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Oh"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299117"},{"key":"ref124","first-page":"667","article-title":"Dynamic filter networks","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Brabandere"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.319"},{"key":"ref126","first-page":"613","article-title":"Generating videos with scene dynamics","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Vondrick"},{"key":"ref127","article-title":"Adversarial video generation on complex datasets","author":"Clark","year":"2019"},{"key":"ref128","article-title":"Transformation-based adversarial video prediction on large-scale data","author":"Luc","year":"2020"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"ref130","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","volume-title":"Proc. 27th Int. Conf. Neural Inf. Process. Syst.","author":"Simonyan"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"ref132","article-title":"Decomposing motion and content for natural video sequence prediction","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Villegas"},{"key":"ref133","first-page":"4417","article-title":"Unsupervised learning of disentangled representations from video","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Denton"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00910"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00558"},{"key":"ref136","first-page":"515","article-title":"Learning to decompose and disentangle representations for video prediction","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Hsieh"},{"key":"ref137","first-page":"6691","article-title":"Neural expectation maximization","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Greff"},{"key":"ref138","article-title":"Relational neural expectation maximization: Unsupervised discovery of objects and their interactions","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"van Steenkiste"},{"key":"ref139","first-page":"14 222","article-title":"Are disentangled representations helpful for abstract visual reasoning?","volume-title":"Proc. Advances Neural Inf. Process. Syst.","author":"van Steenkiste"},{"key":"ref140","article-title":"Recurrent environment simulators","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Chiappa"},{"key":"ref141","article-title":"Learning visual predictive models of physics for playing billiards","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Fragkiadaki"},{"key":"ref142","article-title":"Learning to act by predicting the future","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref143","article-title":"Self-supervised learning of predictive segmentation models from video","author":"Luc","year":"2019"},{"key":"ref144","first-page":"6918","article-title":"Predicting scene parsing and motion dynamics in the future","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Jin"},{"key":"ref145","article-title":"Future semantic segmentation with convolutional LSTM","volume-title":"Proc. British Mach. Vis. Conf.","author":"Nabavi"},{"key":"ref146","first-page":"2654","article-title":"Do deep nets really need to be deep?","volume-title":"Proc. 27th Int. Conf. Neural Inf. Process. Syst.","author":"Ba"},{"key":"ref147","article-title":"Distilling the knowledge in a neural network","author":"Hinton","year":"2015"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01066"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2992184"},{"key":"ref150","article-title":"Future segmentation using 3D structure","author":"Vora","year":"2018"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298720"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.75"},{"key":"ref154","article-title":"Variational approaches for auto-encoding generative adversarial networks","author":"Rosca","year":"2017"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350949"},{"key":"ref157","first-page":"1252","article-title":"Deep visual analogy-making","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Reed"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-67070-2_36"},{"key":"ref159","article-title":"Stochastic adversarial video prediction","author":"Lee","year":"2018"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803792"},{"key":"ref161","first-page":"1174","article-title":"Stochastic video generation with a learned prior","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","author":"Denton"},{"key":"ref162","article-title":"Learning a driving simulator","author":"Santana","year":"2016"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10735"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.45"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296547"},{"key":"ref167","article-title":"Traffic4cast: Traffic map movie forecasting","year":"2020"},{"key":"ref168","article-title":"A short note about kinetics-600","author":"Carreira","year":"2018"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.1315150"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.28"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298925"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_42"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-88682-2_5"},{"key":"ref174","article-title":"Generating multi-agent trajectories using programmatic weak supervision","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Zhan"},{"key":"ref175","first-page":"1234","article-title":"Learning to linearize under uncertainty","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Goroshin"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.465"},{"key":"ref177","article-title":"Motion prediction under multimodality with conditional stochastic networks","author":"Fragkiadaki","year":"2017","journal-title":"arXiv: 1705.02082"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15555-0_21"},{"key":"ref179","article-title":"Prediction under uncertainty with error-encoding networks","author":"Henaff","year":"2017"},{"key":"ref180","first-page":"5092","article-title":"Learning to poke by poking: Experiential learning of intuitive physics","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Agrawal"},{"key":"ref181","first-page":"1928","article-title":"Asynchronous methods for deep reinforcement learning","volume-title":"Proc. 33rd Int. Conf. Int. Conf. Mach. Learn.","author":"Mnih"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10857"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00770"},{"key":"ref184","article-title":"BDD100K: A diverse driving video database with scalable annotation tooling","author":"Yu","year":"2018"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.534"},{"key":"ref186","first-page":"2236","article-title":"Visual Dynamics: Probabilistic future frame synthesis via cross convolutional networks","volume-title":"Proc. Advances Neural Inf. Process. Syst.","author":"Xue"},{"key":"ref187","article-title":"Auto-encoding variational bayes","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_47"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1145\/2185520.2185561"},{"key":"ref190","first-page":"81","article-title":"High fidelity video prediction with large stochastic recurrent neural networks","volume-title":"Proc. Advances Neural Inf. Process. Syst.","author":"Villegas"},{"key":"ref191","first-page":"3745","article-title":"Ladder variational autoencoders","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"S\u00f8nderby"},{"key":"ref192","article-title":"Video extrapolation with an invertible linear embedding","author":"Pottorff","year":"2019"},{"key":"ref193","article-title":"Glow: Generative flow with invertible 1x1 convolutions","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Kingma"},{"key":"ref194","article-title":"Videoflow: A conditional flow-based model for stochastic video generation","volume-title":"Proc. Int. Conf. Learn. Representations Poster","author":"Kumar"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref197","article-title":"Towards accurate generative models of video: A new metric & challenges","author":"Unterthiner","year":"2018"},{"key":"ref198","first-page":"2234","article-title":"Improved techniques for training GANs","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Salimans"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00158"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1098\/rspb.1980.0020"},{"key":"ref201","first-page":"1486","article-title":"Deep generative image models using a laplacian pyramid of adversarial networks","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Denton"},{"key":"ref202","first-page":"879","article-title":"PredRNN: Recurrent neural networks for predictive learning using spatiotemporal lstms","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref203","first-page":"5123","article-title":"Predrnn++: Towards A resolution of the deep-in-time dilemma in spatiotemporal predictive learning","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref204","article-title":"Video ladder networks","author":"Cricri","year":"2016"},{"key":"ref205","first-page":"6011","article-title":"Recurrent ladder networks","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Pr"},{"key":"ref206","first-page":"1771","article-title":"Video pixel networks","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Kalchbrenner"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8594264"},{"key":"ref208","first-page":"296","article-title":"Mutual suppression network for video prediction using disentangled features","author":"Lee","year":"2019","journal-title":"Proc. British Mach. Vis. Conf."},{"key":"ref209","article-title":"Scaling autoregressive video models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Weissenborn"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.396"},{"key":"ref211","article-title":"A note on the evaluation of generative models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Theis"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/9769881\/09294028.pdf?arnumber=9294028","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,9]],"date-time":"2024-01-09T23:13:06Z","timestamp":1704841986000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9294028\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,1]]},"references-count":211,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2020.3045007","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,6,1]]}}}