{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T19:17:55Z","timestamp":1777663075192,"version":"3.51.4"},"reference-count":62,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tnnls.2025.3585949","type":"journal-article","created":{"date-parts":[[2025,7,22]],"date-time":"2025-07-22T18:06:04Z","timestamp":1753207564000},"page":"19106-19118","source":"Crossref","is-referenced-by-count":2,"title":["Video Prediction of Dynamic Physical Simulations With Pixel-Space Spatiotemporal Transformers"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2042-6696","authenticated-orcid":false,"given":"Dean L.","family":"Slack","sequence":"first","affiliation":[{"name":"Department of Computer Science, Durham University, Durham, U.K."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3562-3593","authenticated-orcid":false,"given":"G.","family":"Thomas Hudson","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Durham University, Durham, U.K."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Winterbottom","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Durham University, Durham, U.K."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8942-355X","authenticated-orcid":false,"given":"Noura","family":"Al Moubayed","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Durham University, Durham, U.K."}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Dosovitskiy"},{"key":"ref3","article-title":"VideoGPT: Video generation using VQ-VAE and transformers","author":"Yan","year":"2021","journal-title":"arXiv:2104.10157"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3045007"},{"key":"ref5","first-page":"14837","article-title":"Generating diverse high-fidelity images with VQ-VAE-2","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Razavi"},{"key":"ref6","article-title":"Text-to-image diffusion models in generative AI: A survey","author":"Zhang","year":"2023","journal-title":"arXiv:2303.07909"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00770"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2987281"},{"key":"ref9","article-title":"Emergent abilities of large language models","volume":"2022","author":"Wei","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref10","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","author":"Brown"},{"key":"ref11","article-title":"Benchmarking autoregressive conditional diffusion models for turbulent flow simulation","author":"Kohl","year":"2023","journal-title":"arXiv:2309.01745"},{"key":"ref12","article-title":"MetNet: A neural weather model for precipitation forecasting","author":"S\u00f8nderby","year":"2020","journal-title":"arXiv:2003.12140"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989324"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00659"},{"key":"ref15","article-title":"GAIA-1: A generative world model for autonomous driving","author":"Hu","year":"2023","journal-title":"arXiv:2309.17080"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00317"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3696415"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"ref20","article-title":"A survey on future frame synthesis: Bridging deterministic and generative approaches","author":"Ming","year":"2024","journal-title":"arXiv:2401.14718"},{"key":"ref21","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref22","first-page":"10541","article-title":"Large scale adversarial representation learning","volume-title":"Proc. NeurIPS","volume":"32","author":"Donahue"},{"key":"ref23","article-title":"Video (language) modeling: A baseline for generative models of natural videos","author":"Ranzato","year":"2014","journal-title":"arXiv:1412.6604"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICAC3N53548.2021.10702914"},{"key":"ref25","first-page":"6306","article-title":"Neural discrete representation learning","volume-title":"Proc. NIPS","author":"van den Oord"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"ref27","article-title":"CogVideo: Large-scale pretraining for text-to-video generation via transformers","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Hong"},{"key":"ref28","first-page":"658","article-title":"Generating images with perceptual similarity metrics based on deep networks","volume-title":"Proc. NIPS","volume":"29","author":"Dosovitskiy"},{"key":"ref29","article-title":"Transformation-based models of video sequences","author":"van Amersfoort","year":"2017","journal-title":"arXiv:1701.08435"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01084"},{"key":"ref31","first-page":"12077","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","volume-title":"Proc. Neural Inf. Process. Syst. (NeurIPS)","author":"Xie"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897982"},{"key":"ref34","first-page":"1","article-title":"Scaling autoregressive video models","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Weissenborn"},{"key":"ref35","first-page":"1747","article-title":"Pixel recurrent neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Van Den Oord"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.5220\/0010241801010112"},{"key":"ref37","article-title":"Implicit stacked autoregressive model for video prediction","author":"Seo","year":"2023","journal-title":"arXiv:2303.07849"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157104"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01149"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.5244\/C.30.39"},{"key":"ref41","article-title":"The power of next-frame prediction for learning physical laws","author":"Winterbottom","year":"2024","journal-title":"arXiv:2405.17450"},{"key":"ref42","first-page":"1","article-title":"Continuous PDE dynamics forecasting with implicit neural representations","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Yin"},{"key":"ref43","first-page":"3424","article-title":"Accelerating Eulerian fluid simulation with convolutional networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tompson"},{"key":"ref44","first-page":"11128","article-title":"Learning stable deep dynamics models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Manek"},{"key":"ref45","first-page":"1494","article-title":"Deep learning for physical processes: Incorporating prior scientific knowledge","volume-title":"Proc. ICLR","author":"de Bezenac"},{"key":"ref46","first-page":"1","article-title":"Fourier neural operator for parametric partial differential equations","volume-title":"Proc. ICLR","author":"Li"},{"key":"ref47","first-page":"21640","article-title":"Meta-learning dynamics forecasting using task inference","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref48","article-title":"Gaussian error linear units (GELUs)","author":"Hendrycks","year":"2016","journal-title":"arXiv:1606.08415"},{"key":"ref49","first-page":"22466","article-title":"Vision transformers need registers","volume-title":"Proc. 12th Int. Conf. Learn. Represent.","author":"Darcet"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1016\/0168-9274(95)00108-5"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00035"},{"key":"ref52","first-page":"1","article-title":"Learning particle dynamics for manipulating rigid bodies, deformable objects, and fluids","volume-title":"Proc. ICLR","author":"Li"},{"key":"ref53","first-page":"843","article-title":"Unsupervised learning of video representations using LSTMs","volume-title":"Proc. 32nd Int. Conf. Int. Conf. Mach. Learn.","volume":"37","author":"Srivastava"},{"key":"ref54","first-page":"344","article-title":"Self-supervised visual planning with temporal skip connections","volume-title":"Proc. 1st Conf. Robot Learn. (CoRL)","volume":"78","author":"Ebert"},{"key":"ref55","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Kingma"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref57","article-title":"Open-sora: Democratizing efficient video production for all","author":"Zheng","year":"2024","journal-title":"arXiv:2412.20404"},{"key":"ref58","first-page":"1","article-title":"CV-VAE: A compatible video VAE for latent generative video models","volume-title":"Proc. 38th Annu. Conf. Neural Inf. Process. Syst.","author":"Zhao"},{"key":"ref59","article-title":"OPT: Open pre-trained transformer language models","author":"Zhang","year":"2022","journal-title":"arXiv:2205.01068"},{"issue":"140","key":"ref60","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2019","journal-title":"J. Mach. Learn. Res."},{"issue":"240","key":"ref61","first-page":"1","article-title":"PaLM: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref62","article-title":"FVD: A new metric for video generation","volume-title":"Proc. DGS@ICLR","author":"Unterthiner"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/11195929\/11089993.pdf?arnumber=11089993","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T17:38:35Z","timestamp":1759945115000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11089993\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":62,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2025.3585949","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}