{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:00:42Z","timestamp":1777888842914,"version":"3.51.4"},"reference-count":130,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100013348","name":"Swiss Innovation Agency Innosuisse","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013348","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100016353","name":"SDSC","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100016353","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021847","name":"Swiss National Supercomputing Centre (CSCS)","doi-asserted-by":"publisher","award":["a03"],"award-info":[{"award-number":["a03"]}],"id":[{"id":"10.13039\/501100021847","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01008","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"10830-10843","source":"Crossref","is-referenced-by-count":0,"title":["EgoM2P: Egocentric Multimodal Multitask Pretraining"],"prefix":"10.1109","author":[{"given":"Gen","family":"Li","sequence":"first","affiliation":[{"name":"ETH Z&#x00FC;rich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yutong","family":"Chen","sequence":"additional","affiliation":[{"name":"ETH Z&#x00FC;rich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiqian","family":"Wu","sequence":"additional","affiliation":[{"name":"ETH Z&#x00FC;rich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaifeng","family":"Zhao","sequence":"additional","affiliation":[{"name":"ETH Z&#x00FC;rich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Marc","family":"Pollefeys","sequence":"additional","affiliation":[{"name":"ETH Z&#x00FC;rich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siyu","family":"Tang","sequence":"additional","affiliation":[{"name":"ETH Z&#x00FC;rich"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","year":"2023","journal-title":"Gpt-4v(ision) system card"},{"key":"ref2","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","volume-title":"Technical Report MSR-TR-202412","author":"Abdin","year":"2024"},{"key":"ref3","article-title":"Cosmos world foundation model platform for physical ai","author":"Agarwal","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73030-6_14"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73030-6_14"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_20"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1977"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.226"},{"key":"ref10","article-title":"Introducing hot3d: An egocentric dataset for 3d hand and object tracking","author":"Banerjee","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref11","author":"Blattmann","year":"2023","journal-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72992-8_24"},{"key":"ref14","article-title":"Genie: Generative interactive environments","volume-title":"Forty-first International Conference on Machine Learning","author":"Bruce","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2272"},{"key":"ref19","author":"Chen","year":"2024","journal-title":"Control-a-video: Controllable text-to-video diffusion models with motion prior and reward feedback learning"},{"key":"ref20","article-title":"Scaling egocentric vision: The epickitchens dataset","volume-title":"European Conference on Computer Vision (ECCV)","author":"Damen","year":"2018"},{"key":"ref21","article-title":"An image is worth $16 \\times 16$ words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01244"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00054"},{"key":"ref25","article-title":"Violet: End-toend video-language transformers with masked visual-token modeling","author":"Fu","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00050"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref28","author":"Goyal","year":"2018","journal-title":"Accurate, large minibatch sgd: Training imagenet in 1 hour"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01834"},{"key":"ref31","article-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Guo","year":"2024"},{"key":"ref32","article-title":"World models","author":"Ha","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref34","article-title":"Classifier-free diffusion guidance","author":"Ho","year":"2021","journal-title":"NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0628"},{"key":"ref36","article-title":"The curious case of neural text degeneration","volume-title":"International Conference on Learning Representations","author":"Holtzman","year":"2020"},{"key":"ref37","article-title":"Cogvideo: Large-scale pretraining for text-tovideo generation via transformers","volume-title":"The Eleventh International Conference on Learning Representations","author":"Hong","year":"2023"},{"key":"ref38","article-title":"Autoregressive diffusion models","volume-title":"International Conference on Learning Representations","author":"Hoogeboom","year":"2022"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.02238"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_46"},{"key":"ref41","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00559"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00678"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1907"},{"key":"ref48","first-page":"25105","article-title":"VideoPoet: A large language model for zero-shot video generation","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Kondratyuk","year":"2024"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00998"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01879-7"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01374"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00981"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475596"},{"key":"ref57","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref58","article-title":"Exocentric-to-egocentric video generation","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems","author":"Liu","year":"2024"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01296"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02034"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02054"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref63","article-title":"Decoupled weight decay regularization","volume-title":"International Conference on Learning Representations","author":"Loshchilov","year":"2019"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"ref65","article-title":"UNIFIED-IO: A unified model for vision, language, and multi-modal tasks","volume-title":"The Eleventh International Conference on Learning Representations","author":"Lu","year":"2023"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02125"},{"key":"ref67","volume-title":"Dream Machine","year":"2024"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"ref69","article-title":"Aria everyday activities dataset","author":"Lv","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72691-0_25"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73397-0_18"},{"key":"ref73","volume-title":"Project Aria Glasses","year":"2023"},{"key":"ref74","article-title":"Transformers are sample-efficient world models","author":"Micheli","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref75","volume-title":"HoloLens 2","year":"2019"},{"key":"ref76","article-title":"4M: Massively multimodal masked modeling","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Mizrahi","year":"2023"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01249"},{"key":"ref78","volume-title":"Video generation models as world simulators","year":"2024"},{"key":"ref79","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Transactions on Machine Learning Research"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01842"},{"key":"ref81","article-title":"Movie gen: A cast of media foundation models","author":"Polyak","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref82","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"issue":"1","key":"ref83","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref85","volume-title":"Gen-3 Alpha","year":"2024"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_40"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"ref88","article-title":"Make-a-video: Text-to-video generation without text-video data","volume-title":"The Eleventh International Conference on Learning Representations","author":"Singer","year":"2023"},{"key":"ref89","article-title":"The Replica dataset: A digital replica of indoor spaces","author":"Straub","year":"2019","journal-title":"arXiv preprint arXiv"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.649"},{"key":"ref91","article-title":"Emu: Generative pretraining in multimodality","volume-title":"The Twelfth International Conference on Learning Representations","author":"Sun","year":"2024"},{"key":"ref92","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref93","year":"2024","journal-title":"Kling ai video generator"},{"key":"ref94","article-title":"DROID-SLAM: Deep visual SLAM for monocular, stereo, and RGB-d cameras","author":"Teed","year":"2021","journal-title":"NeuRIPS"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0732"},{"key":"ref96","author":"Unterthiner","year":"2019","journal-title":"Towards accurate generative models of video: A new metric & challenges"},{"key":"ref97","article-title":"Diffusion models are real-time game engines","author":"Valevski","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref98","article-title":"Neural discrete representation learning","author":"van den Oord","year":"2017","journal-title":"Neural Information Processing Systems"},{"key":"ref99","article-title":"Attention is all you need","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani","year":"2017"},{"key":"ref100","article-title":"Phenaki: Variable length video generation from open domain textual descriptions","volume-title":"International Conference on Learning Representations","author":"Villegas","year":"2023"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00484"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"ref103","first-page":"23318","article-title":"OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"International Conference on Machine Learning, ICML 2022","author":"Wang","year":"2022"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01956"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01854"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0334"},{"key":"ref108","article-title":"Egovid-5m: A large-scale video-action dataset for egocentric video generation","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref109","article-title":"Internvideo: General video foundation models via generative and discriminative learning","author":"Wang","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"ref112","first-page":"53366","article-title":"Next-gpt: Any-to-any multimodal llm","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Wu","year":"2024"},{"key":"ref113","author":"Wu","year":"2024","journal-title":"Deepseek-vl2: Mixture-of-experts vision-language models for advanced multimodal understanding"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"ref115","article-title":"Xgen: Ego-centric video prediction by watching exo-centric videos","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Xu","year":"2025"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"ref117","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Yang","year":"2025"},{"key":"ref118","article-title":"MMEgo: Towards building egocentric multimodal LLMs","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Ye","year":"2025"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01806"},{"key":"ref120","article-title":"Vector-quantized image modeling with improved VQGAN","volume-title":"International Conference on Learning Representations","author":"Yu","year":"2022"},{"key":"ref121","article-title":"When and why vision-language models behave like bags-of-words, and what to do about it?","volume-title":"The Eleventh International Conference on Learning Representations","author":"Yuksekgonul","year":"2023"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00050"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_11"},{"key":"ref126","article-title":"Controlvideo: Training-free controllable text-to-video generation","volume-title":"The Twelfth International Conference on Learning Representations","author":"Zhang","year":"2024"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_39"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.511"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444031.pdf?arnumber=11444031","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:59:51Z","timestamp":1777611591000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444031\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":130,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01008","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}