{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:42:30Z","timestamp":1777567350770,"version":"3.51.4"},"reference-count":68,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.00220","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T13:30:52Z","timestamp":1692711052000},"page":"2214-2224","source":"Crossref","is-referenced-by-count":60,"title":["Rethinking Video ViTs: Sparse Video Tubes for Joint Image and Video Learning"],"prefix":"10.1109","author":[{"given":"AJ","family":"Piergiovanni","sequence":"first","affiliation":[{"name":"Google Research"}]},{"given":"Weicheng","family":"Kuo","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Anelia","family":"Angelova","sequence":"additional","affiliation":[{"name":"Google Research"}]}],"member":"263","reference":[{"key":"ref13","article-title":"X3d: Expanding architectures for efficient video recognition","author":"christoph","year":"0","journal-title":"CVPR"},{"key":"ref57","article-title":"What do position embeddings learn? an empirical study of pre-trained language model positional encoding","author":"wang","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref56","article-title":"Unidual: A unified model for image and video understanding","author":"wang","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00137"},{"key":"ref14","article-title":"Masked autoencoders as spatiotemporal learners","author":"feichtenhofer","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref58","author":"wei","year":"2021","journal-title":"Masked feature prediction for self-supervised visual pre-training"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"ref52","first-page":"69","article-title":"Efficient video transformers with spatial-temporal token selection","author":"wang","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref11","article-title":"Omni-sourced webly-supervised learning for video recognition","author":"duan","year":"0","journal-title":"ECCV"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref10","article-title":"Revisiting 3d resnets for video recognition","author":"du","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref17","first-page":"1451","article-title":"Smart frame selection for action recognition","volume":"35","author":"shreyank","year":"0","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"ref19","article-title":"Perceiver io: A general architecture for structured inputs & outputs","author":"jaegle","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref18","article-title":"The &#x201C;omething something&#x201D; video database for learning and evaluating visual common sense","author":"goyal","year":"0","journal-title":"ICCV"},{"key":"ref51","author":"wang","year":"2022","journal-title":"Omnivl One foundation model for image-language and video-language tasks"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref45","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","author":"tong","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"ref42","article-title":"How to train your vit? data, augmentation, and regularization in vision transformers","author":"steiner","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref41","article-title":"Two-stream convolutional networks for action recognition in videos","author":"simonyan","year":"0","journal-title":"NeurIPS"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"ref49","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"NeurIPS"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref7","article-title":"A short note on the kinetics-700 human action dataset","author":"carreira","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref9","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"0","journal-title":"ICLRE"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref6","article-title":"Is space-time attention all you need for video understanding?","author":"bertasius","year":"0","journal-title":"ICML"},{"key":"ref5","author":"bao","year":"2021","journal-title":"Beit Bert pre-training of image transformers"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.207"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_5"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00188"},{"key":"ref37","first-page":"13937","article-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification","volume":"34","author":"rao","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01233"},{"key":"ref31","article-title":"Expanding language-image pretrained models for general video recognition","author":"ni","year":"0","journal-title":"ECCV"},{"key":"ref30","first-page":"23296","article-title":"Intriguing properties of vision transformers","volume":"34","author":"muzammal naseer","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref33","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","author":"patrick","year":"0","journal-title":"NeurIPS 2021"},{"key":"ref32","first-page":"160","article-title":"K-centered patch sampling for efficient video recognition","author":"hyeon park","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref2","author":"alayrac","year":"2022","journal-title":"Flamingo a visual language model for few-shot learning"},{"key":"ref1","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","author":"akbari","year":"0","journal-title":"NeurIPS"},{"key":"ref39","first-page":"654","article-title":"Assemblenet ++: Assembling modality representations via attention connections","author":"michael","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref38","author":"ryoo","year":"2021","journal-title":"Tokenlearner Adaptive space-time tokenization for videos"},{"key":"ref24","article-title":"Not all patches are what you need: Expediting vision transformers via token reorganizations","author":"liang","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"ref67","article-title":"Co-training trans-former with videos and images improves action recognition","author":"zhang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref25","article-title":"Polyvit: Co-training vision transformers on images, videos and audio","author":"likhosherstov","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref20","article-title":"The kinetics human action video dataset","author":"kay","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref64","article-title":"Florence: A new foundation model for computer vision","author":"yuan","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref63","author":"yu","year":"2022","journal-title":"CoCa Contrastive Captioners are Image-Text Foundation Models"},{"key":"ref22","article-title":"Diverse temporal aggregation and depthwise spatiotemporal factorization for efficient video classification","author":"lee","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01576"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"ref28","article-title":"Video swin transformer","author":"liu","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref27","first-page":"388","article-title":"Frozen clip models are efficient video learners","author":"lin","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref29","article-title":"Token pooling in vision transformers","author":"marin","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref60","article-title":"Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification","author":"xie","year":"0","journal-title":"ECCV"},{"key":"ref62","article-title":"Filip: Fine-grained interactive language-image pre-training","author":"yao","year":"2022","journal-title":"ICLRE"},{"key":"ref61","article-title":"Multiview trans-formers for video recognition","author":"yan","year":"0","journal-title":"CVPR"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Vancouver, BC, Canada","start":{"date-parts":[[2023,6,17]]},"end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10203937.pdf?arnumber=10203937","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,11]],"date-time":"2023-09-11T13:58:08Z","timestamp":1694440688000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10203937\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":68,"URL":"https:\/\/doi.org\/10.1109\/cvpr52729.2023.00220","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}