{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:15:55Z","timestamp":1777889755548,"version":"3.51.4"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02043","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"22004-22013","source":"Crossref","is-referenced-by-count":0,"title":["Everything is a Video: Unifying Modalities Through Next-Frame Prediction"],"prefix":"10.1109","author":[{"given":"G.","family":"Thomas Hudson","sequence":"first","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dean","family":"Slack","sequence":"additional","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Winterbottom","sequence":"additional","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jamie","family":"Sterling","sequence":"additional","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenghao","family":"Xiao","sequence":"additional","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junjie","family":"Shentu","sequence":"additional","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Noura","family":"Al Moubayed","sequence":"additional","affiliation":[{"name":"Durham University,Department of Computer Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-16560-x"},{"key":"ref2","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","author":"Baevski","year":"2022","journal-title":"ArXiv, abs\/2202.03555"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.jfranklin.2023.11.038"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2475625"},{"key":"ref6","first-page":"1691","article-title":"Generative pretraining from pixels","volume-title":"International conference on machine learning","author":"Chen"},{"key":"ref7","article-title":"Uniter: Learning universal image-text representations","author":"Chen","year":"2019","journal-title":"ArXiv, abs\/1909.11740"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00633"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412541"},{"key":"ref10","article-title":"An image is worth $16\\times 16$ words: Transformers for image recognition at scale","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00552"},{"key":"ref12","article-title":"Improving language understanding from screenshots","author":"Gao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72986-7_23"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01189-x"},{"key":"ref16","article-title":"Unifying question answering, text classification, and regression via span extraction","author":"Shirish Keskar","year":"2019","journal-title":"arXiv preprint"},{"key":"ref17","author":"Krizhevsky","year":"2009","journal-title":"Learning multiple layers of features from tiny images"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0342-8"},{"key":"ref19","article-title":"Decoupled weight decay regularization","volume-title":"International Conference on Learning Representations","author":"Loshchilov","year":"2017"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.1400"},{"key":"ref21","article-title":"The natural language decathlon: Multitask learning as question answering","author":"McCann","year":"2018","journal-title":"arXiv preprint"},{"key":"ref22","article-title":"Transframer: Arbitrary frame prediction with generative models","author":"Nash","year":"2022","journal-title":"arXiv preprint"},{"key":"ref23","year":"2024","journal-title":"Gpt-4 technical report"},{"key":"ref24","article-title":"Video (language) modeling: a baseline for generative models of natural videos","author":"Ranzato","year":"2014","journal-title":"ArXiv, abs\/1412.6604"},{"key":"ref25","article-title":"Language modelling with pixels","author":"Rust","year":"2022","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Implicit stacked autoregressive model for video prediction","author":"Seo","year":"2023","journal-title":"ArXiv, abs\/2303.07849"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01519"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1170"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.874"},{"key":"ref30","article-title":"van Amersfoort, Anitha Kannan, Marc\u2019Aurelio Ranzato, Arthur Szlam, Du Tran, and Soumith Chintala","volume":"abs\/1701.08435","author":"Joost","year":"2017","journal-title":"Transformation-based models of video sequences"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680886"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.05.001"},{"key":"ref34","article-title":"Pixel sentence representation learning","author":"Xiao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","author":"Yan","year":"2021","journal-title":"Videogpt: Video generation using vq-vae and transformers"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_19"},{"key":"ref37","article-title":"Clevrer: Collision events for video representation and reasoning","author":"Yi","year":"2019","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506210"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref40","article-title":"Bag of tricks for effective language model pretraining and downstream adaptation: A case study on glue","author":"Zhong","year":"2023","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ITCA52113.2020.00069"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445480.pdf?arnumber=11445480","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:18:16Z","timestamp":1777612696000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445480\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02043","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}