{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,22]],"date-time":"2024-09-22T04:11:32Z","timestamp":1726978292051},"reference-count":69,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.02531","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"26794-26804","source":"Crossref","is-referenced-by-count":0,"title":["Mirasol3B: A Multimodal Autoregressive Model for Time-Aligned and Contextual Modalities"],"prefix":"10.1109","author":[{"given":"AJ","family":"Piergiovanni","sequence":"first","affiliation":[{"name":"Google DeepMind"}]},{"given":"Isaac","family":"Noble","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Dahun","family":"Kim","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]},{"given":"Michael S.","family":"Ryoo","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]},{"given":"Victor","family":"Gomes","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Anelia","family":"Angelova","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]}],"member":"263","reference":[{"key":"ref1","article-title":"CM3: A causal masked multimodal model of the internet","author":"Aghajanyan","year":"2022","journal-title":"ArXiv:2201.07520"},{"key":"ref2","volume-title":"Flamingo: a visual language model for few-shot learning","author":"Alayrac","year":"2022"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02209"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/icassp40776.2020.9053174"},{"key":"ref7","article-title":"Pali: A jointly-scaled multilingual language image model","author":"Chen","year":"2023","journal-title":"ICLR"},{"key":"ref8","article-title":"Videoofa: Two stage pre-training for video-to-text generation","author":"Chen","year":"2023","journal-title":"arXiv:abs\/2305.03204"},{"key":"ref9","article-title":"Vindlu: A recipe for effective video-and-language pretraining","author":"Cheng","year":"2022","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00241"},{"key":"ref11","article-title":"Violet: End-to-end video-language transformers with masked visual-token modeling","author":"Fu","year":"2021","journal-title":"arXiv:2111.1268"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02193"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3224688"},{"key":"ref15","article-title":"Contrastive audio-visual masked autoencoder","author":"Gong","year":"2023","journal-title":"ICLR"},{"key":"ref16","article-title":"Mavil: Masked audio-video learners","author":"Huang","journal-title":"ArXiv:2212.08071"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096198"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-55065-7_301691"},{"key":"ref19","volume-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"Korbar","year":"2018"},{"key":"ref20","article-title":"MaMMUT: A simple architecture for joint learning for multimodal tasks","author":"Kuo","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref21","volume-title":"Set transformer: A framework for attention-based permutation-invariant neural networks","author":"Lee","year":"2019"},{"key":"ref22","article-title":"Revealing single frame bias for video-and-language learnin","author":"Lei","year":"2022","journal-title":"ArXiv:abs\/2206.03428"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"ref24","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","author":"Li","year":"2021","journal-title":"NeurIPS"},{"key":"ref25","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","author":"Li","year":"2021","journal-title":"NeurIPS"},{"key":"ref26","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref30","volume-title":"Attention bottlenecks for multimodal fusion","author":"Nagrani","year":"2021"},{"key":"ref31","article-title":"Streamult: Streaming multimodal transformer for heterogeneous and arbitrary long sequential data","author":"Pellegrain","year":"2021","journal-title":"ArXiv:2110.08021"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-13746-7"},{"key":"ref33","article-title":"Answer-me: Multi-task open-vocabulary visual question answering","author":"Piergiovanni","year":"2022","journal-title":"arXiv preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_5"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00220"},{"key":"ref36","article-title":"Dynamic pretraining of vision-language models","volume-title":"First workshop on Multimodal Representation Learning, International Conference on Learning Representations (ICLR)","author":"Piergiovanni"},{"key":"ref37","article-title":"A generalist agent","author":"Reed","year":"2022","journal-title":"ArXiv:2205.06175"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1312"},{"key":"ref39","volume-title":"Tokenlearner: Adaptive space-time tokenization for videos","author":"Ryoo","year":"2021"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01828"},{"key":"ref41","article-title":"Finegrained audio-visual joint representations for multimodal large language models","author":"Sun","year":"2023","journal-title":"ArXiv:2310.05863"},{"key":"ref42","volume-title":"Long-form video-language pretraining with multimodal temporal contrastive learning","author":"Sun","year":"2022"},{"key":"ref43","article-title":"Multimodal transformer for unaligned multimodal language sequences","author":"Tsai","year":"2021","journal-title":"ArXiv:2110.08021"},{"key":"ref44","article-title":"Conditional image generation with pixelcnn decoders","author":"van den Oord","year":"2016","journal-title":"arXiv:1606.05328"},{"key":"ref45","article-title":"All in one: Exploring unified video-language pre-training","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref46","volume-title":"Omnivl:one foundation model for image-language and video-language tasks","author":"Wang","year":"2022"},{"key":"ref47","article-title":"Git: A generative image-to-text transformer for vision and language","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref48","article-title":"One-peace: Exploring one general representation model toward unlimited modalities","author":"Wang","year":"2023","journal-title":"ArXiv:2305.11172"},{"key":"ref49","article-title":"Image as a foreign language: Beit pretraining for all vision and vision language tasks","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref50","article-title":"Internvideo: General video foundation models via generative and discriminative learning","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref51","article-title":"Scaling autoregressive video models","author":"Weissenborn","year":"2020","journal-title":"ICLR"},{"key":"ref52","article-title":"Godiva: Generating open-domain videos from natural descriptions","author":"Wu","year":"2021","journal-title":"ArXiV:2104.14806"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"ref55","article-title":"Memvit: Memory-augmented multiscale vision transformer for efficient long-term video recognition","author":"Wu","year":"2023","journal-title":"ArXiv:2201.08383"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_3"},{"key":"ref58","article-title":"mplug-2: A modularized multi-modal foundation model across text, image and video","author":"Xu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref60","article-title":"Videococa: Video text modeling with zero-shot transfer from contrastive captioners","author":"Yan","year":"2022","journal-title":"ArXiV:2212.04979"},{"key":"ref61","article-title":"Videogpt: Video generation using vq-vae and transformers","author":"Yan","year":"2021","journal-title":"ArXiv:abs\/2104.10157"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"ref63","article-title":"Zero-shot video question answering via frozen bidirectional language models","author":"Yang","year":"2022","journal-title":"ArXiv:abs\/2206.08155"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.01413"},{"key":"ref65","volume-title":"Scaling autoregressive multi-modal models: Pretraining and instruction tuning","author":"Yu","year":"2023"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"ref67","volume-title":"Merlot: Multimodal neural script knowledge models","author":"Zellers","year":"2021"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"ref69","article-title":"Multiscale multimodal transformer for multimodal action recognition","author":"Zhu","journal-title":"ICLR, 2022"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10657010.pdf?arnumber=10657010","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,21]],"date-time":"2024-09-21T05:20:03Z","timestamp":1726896003000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10657010\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":69,"URL":"http:\/\/dx.doi.org\/10.1109\/cvpr52733.2024.02531","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}