{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T04:56:32Z","timestamp":1780635392208,"version":"3.54.1"},"publisher-location":"Cham","reference-count":88,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732348","type":"print"},{"value":"9783031732355","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73235-5_25","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T06:01:53Z","timestamp":1727589713000},"page":"444-463","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["ViC-MAE: Self-supervised Representation Learning from\u00a0Images and\u00a0Video with\u00a0Contrastive Masked Autoencoders"],"prefix":"10.1007","author":[{"given":"Jefferson","family":"Hernandez","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ruben","family":"Villegas","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vicente","family":"Ordonez","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"25_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, P., Carreira, J., Malik, J.: Learning to see by moving. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 37\u201345 (2015)","DOI":"10.1109\/ICCV.2015.13"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"25_CR3","doi-asserted-by":"publisher","unstructured":"Assran, M., et al.: Masked siamese networks for label-efficient learning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13691, pp. 456\u2013473. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19821-2_26","DOI":"10.1007\/978-3-031-19821-2_26"},{"key":"25_CR4","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of image transformers. In: International Conference on Learning Representations (2021)"},{"key":"25_CR5","unstructured":"Bardes, A., et al.: Revisiting feature prediction for learning visual representations from video. arXiv preprint arXiv:2404.08471 (2024)"},{"key":"25_CR6","unstructured":"Bardes, A., Ponce, J., Lecun, Y.: VICReg: variance-invariance-covariance regularization for self-supervised learning. In: International Conference on Learning Representations, ICLR 2022 (2022)"},{"key":"25_CR7","doi-asserted-by":"crossref","unstructured":"Berg, T., Liu, J., Woo\u00a0Lee, S., Alexander, M.L., Jacobs, D.W., Belhumeur, P.N.: Birdsnap: large-scale fine-grained visual categorization of birds. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2011\u20132018 (2014)","DOI":"10.1109\/CVPR.2014.259"},{"key":"25_CR8","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: International Conference on Machine Learning, pp. 813\u2013824. PMLR (2021)"},{"key":"25_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"446","DOI":"10.1007\/978-3-319-10599-4_29","volume-title":"Computer Vision \u2013 ECCV 2014","author":"L Bossard","year":"2014","unstructured":"Bossard, L., Guillaumin, M., Van Gool, L.: Food-101 \u2013 mining discriminative components with random forests. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 446\u2013461. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10599-4_29"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Cai, M., et al.: ViP-LLaVA: making large multimodal models understand arbitrary visual prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2024, pp. 12914\u201312923 (2024)","DOI":"10.1109\/CVPR52733.2024.01227"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"25_CR12","unstructured":"Carreira, J., Noland, E., Banki-Horvath, A., Hillier, C., Zisserman, A.: A short note about kinetics-600. arXiv preprint arXiv:1808.01340 (2018)"},{"key":"25_CR13","unstructured":"Carreira, J., Noland, E., Hillier, C., Zisserman, A.: A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987 (2019)"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Cascante-Bonilla, P., et al.: Going beyond nouns with vision & language models using synthetic data. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), October 2023, pp. 20155\u201320165 (2023)","DOI":"10.1109\/ICCV51070.2023.01844"},{"key":"25_CR15","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"25_CR16","unstructured":"Chen, T., Kornblith, S., Swersky, K., Norouzi, M., Hinton, G.E.: Big self-supervised models are strong semi-supervised learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 22243\u201322255 (2020)"},{"key":"25_CR17","unstructured":"Chen, X., Fan, H., Girshick, R., He, K.: Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Chen, X., He, K.: Exploring simple siamese representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15750\u201315758 (2021)","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3606\u20133613 (2014)","DOI":"10.1109\/CVPR.2014.461"},{"key":"25_CR20","unstructured":"Dehghani, M., et\u00a0al.: Scaling vision transformers to 22 billion parameters. In: International Conference on Machine Learning, pp. 7480\u20137512. PMLR (2023)"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Diba, A., Sharma, V., Gool, L.V., Stiefelhagen, R.: DynamoNet: dynamic action and motion network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6192\u20136201 (2019)","DOI":"10.1109\/ICCV.2019.00629"},{"key":"25_CR23","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Multiscale vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6824\u20136835 (2021)","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"25_CR25","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: an incremental Bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, pp. 178\u2013178. IEEE (2004)"},{"key":"25_CR26","unstructured":"Feichtenhofer, C., Fan, H., Li, Y., He, K.: Masked autoencoders as spatiotemporal learners. In: Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3299\u20133309 (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"Girdhar, R., El-Nouby, A., Singh, M., Alwala, K.V., Joulin, A., Misra, I.: OmniMAE: single model masked pretraining on images and videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10406\u201310417 (2023)","DOI":"10.1109\/CVPR52729.2023.01003"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Ravi, N., van\u00a0der Maaten, L., Joulin, A., Misra, I.: Omnivore: a single model for many visual modalities. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16102\u201316112 (2022)","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"25_CR31","unstructured":"Gordon, D., Ehsani, K., Fox, D., Farhadi, A.: Watching the world go by: representation learning from unlabeled videos. arXiv preprint arXiv:2003.07990 (2020)"},{"key":"25_CR32","doi-asserted-by":"crossref","unstructured":"Goyal, R., et\u00a0al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"25_CR33","unstructured":"Gupta, A., Wu, J., Deng, J., Li, F.F.: Siamese masked autoencoders. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a036, pp. 40676\u201340693. Curran Associates, Inc. (2023). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/7ffb9f1b57628932518505b532301603-Paper-Conference.pdf"},{"key":"25_CR34","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"25_CR37","doi-asserted-by":"crossref","unstructured":"He, R., Cascante-Bonilla, P., Yang, Z., Berg, A.C., Ordonez, V.: Improved visual grounding through self-consistent explanations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13095\u201313105 (2024)","DOI":"10.1109\/CVPR52733.2024.01244"},{"key":"25_CR38","unstructured":"He, R., Cascante-Bonilla, P., Yang, Z., Berg, A.C., Ordonez, V.: Learning from models and data for visual grounding (2024). https:\/\/arxiv.org\/abs\/2403.13804"},{"key":"25_CR39","unstructured":"Huang, Z., et al.: Contrastive masked autoencoders are stronger vision learners. arXiv preprint arXiv:2207.13532 (2022)"},{"key":"25_CR40","unstructured":"Kay, W., et al.: The kinetics human action video dataset (2017). https:\/\/arxiv.org\/abs\/1705.06950"},{"key":"25_CR41","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 554\u2013561 (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"25_CR42","unstructured":"Krizhevsky, A., Hinton, G.: Learning multiple layers of features from tiny images. Technical report\u00a00, University of Toronto, Toronto, Ontario (2009). https:\/\/www.cs.toronto.edu\/~kriz\/learning-features-2009-TR.pdf"},{"key":"25_CR43","doi-asserted-by":"crossref","unstructured":"Lehner, J., Alkin, B., F\u00fcrst, A., Rumetshofer, E., Miklautz, L., Hochreiter, S.: Contrastive tuning: a little help to make masked autoencoders forget. arXiv preprint arXiv:2304.10520 (2023)","DOI":"10.1609\/aaai.v38i4.28078"},{"key":"25_CR44","unstructured":"Li, K., et al.: UniFormerV2: spatiotemporal learning by arming image ViTs with video uniformer. arXiv preprint arXiv:2211.09552 (2022)"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Unmasked teacher: towards training-efficient video foundation models. arXiv preprint arXiv:2303.16058 (2023)","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"25_CR46","doi-asserted-by":"publisher","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13669, pp. 280\u2013296. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_17","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"25_CR47","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: MViTv2: improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4804\u20134814 (2022)","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"25_CR48","unstructured":"Likhosherstov, V., et al.: PolyViT: co-training vision transformers on images, videos and audio. arXiv preprint arXiv:2111.12993 (2021)"},{"key":"25_CR49","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin Transformer V2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"25_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Video Swin Transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133211 (2022)","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"25_CR51","unstructured":"Lotter, W., Kreiman, G., Cox, D.: Deep predictive coding networks for video prediction and unsupervised learning. In: International Conference on Learning Representations (2016)"},{"key":"25_CR52","unstructured":"Lu, C.Z., Jin, X., Huang, Z., Hou, Q., Cheng, M.M., Feng, J.: CMAE-V: contrastive masked autoencoders for video action recognition. arXiv preprint arXiv:2301.06018 (2023)"},{"key":"25_CR53","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)"},{"key":"25_CR54","unstructured":"Mathieu, M., Couprie, C., LeCun, Y.: Deep multi-scale video prediction beyond mean square error. In: 4th International Conference on Learning Representations, ICLR 2016 (2016)"},{"key":"25_CR55","unstructured":"Mishra, S., et al.: A simple, efficient and scalable contrastive masked autoencoder for learning visual representations. arXiv preprint arXiv:2210.16870 (2022)"},{"issue":"2","key":"25_CR56","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1109\/TPAMI.2019.2901464","volume":"42","author":"M Monfort","year":"2020","unstructured":"Monfort, M., et al.: Moments in time dataset: one million videos for event understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(2), 502\u2013508 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"25_CR57","doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729. IEEE (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"25_CR58","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"25_CR59","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"25_CR60","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"25_CR61","unstructured":"Parthasarathy, N., Eslami, S., Carreira, J., H\u00e9naff, O.J.: Self-supervised video pretraining yields strong image representations. arXiv preprint arXiv:2210.06433 (2022)"},{"key":"25_CR62","doi-asserted-by":"crossref","unstructured":"Pathak, D., Girshick, R., Doll\u00e1r, P., Darrell, T., Hariharan, B.: Learning features by watching objects move. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2701\u20132710 (2017)","DOI":"10.1109\/CVPR.2017.638"},{"key":"25_CR63","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Kuo, W., Angelova, A.: Rethinking video ViTs: sparse video tubes for joint image and video learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2214\u20132224 (2023)","DOI":"10.1109\/CVPR52729.2023.00220"},{"key":"25_CR64","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6964\u20136974 (2021)","DOI":"10.1109\/CVPR46437.2021.00689"},{"issue":"7","key":"25_CR65","doi-asserted-by":"publisher","first-page":"1655","DOI":"10.1109\/TPAMI.2018.2846566","volume":"41","author":"F Radenovi\u0107","year":"2018","unstructured":"Radenovi\u0107, F., Tolias, G., Chum, O.: Fine-tuning CNN image retrieval with no human annotation. IEEE Trans. Pattern Anal. Mach. Intell. 41(7), 1655\u20131668 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"25_CR66","unstructured":"Shrivastava, A., Selvaraju, R.R., Naik, N., Ordonez, V.: CLIP-Lite: information efficient visual representation learning with language supervision. In: International Conference on Artificial Intelligence and Statistics, pp. 8433\u20138447. PMLR (2023)"},{"key":"25_CR67","doi-asserted-by":"crossref","unstructured":"Singh, N., Wu, C.W., Orife, I., Kalayeh, M.: Looking similar sounding different: leveraging counterfactual cross-modal pairs for audiovisual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26907\u201326918 (2024)","DOI":"10.1109\/CVPR52733.2024.02541"},{"key":"25_CR68","unstructured":"Sriram, A., Gaidon, A., Wu, J., Niebles, J.C., Fei-Fei, L., Adeli, E.: HomE: homography-equivariant video representation learning. arXiv preprint arXiv:2306.01623 (2023)"},{"key":"25_CR69","unstructured":"Srivastava, N., Mansimov, E., Salakhudinov, R.: Unsupervised learning of video representations using LSTMs. In: International Conference on Machine Learning, pp. 843\u2013852. PMLR (2015)"},{"key":"25_CR70","doi-asserted-by":"crossref","unstructured":"Srivastava, S., Sharma, G.: OmniVec: learning robust representations with cross modal sharing. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1236\u20131248 (2024)","DOI":"10.1109\/WACV57701.2024.00127"},{"key":"25_CR71","unstructured":"Tang, Y., Shimada, D., Bi, J., Xu, C.: AVicuna: audio-visual LLM with interleaver and context-boundary alignment for temporal referential dialogue (2024). https:\/\/arxiv.org\/abs\/2403.16276"},{"key":"25_CR72","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. In: Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"25_CR73","doi-asserted-by":"publisher","unstructured":"Touvron, H., Cord, M., J\u00e9gou, H.: DeiT III: revenge of the ViT. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13684, pp. 516\u2013533. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_30","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"25_CR74","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Anticipating visual representations from unlabeled video. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 98\u2013106 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"25_CR75","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"835","DOI":"10.1007\/978-3-319-46478-7_51","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Walker","year":"2016","unstructured":"Walker, J., Doersch, C., Gupta, A., Hebert, M.: An uncertain future: forecasting from static images using variational autoencoders. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part VII 14. LNCS, vol. 9911, pp. 835\u2013851. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_51"},{"key":"25_CR76","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: VideoMAE V2: scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"25_CR77","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: BEVT: BERT pretraining of video transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14733\u201314743 (2022)","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"25_CR78","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: Masked video distillation: rethinking masked feature modeling for self-supervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6312\u20136322 (2023)","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"25_CR79","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"25_CR80","doi-asserted-by":"crossref","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2566\u20132576 (2019)","DOI":"10.1109\/CVPR.2019.00267"},{"key":"25_CR81","doi-asserted-by":"crossref","unstructured":"Wei, C., Fan, H., Xie, S., Wu, C.Y., Yuille, A., Feichtenhofer, C.: Masked feature prediction for self-supervised visual pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14668\u201314678 (2022)","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"25_CR82","doi-asserted-by":"crossref","unstructured":"Wu, H., Wang, X.: Contrastive learning of image representations with cross-video cycle-consistency. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10149\u201310159 (2021)","DOI":"10.1109\/ICCV48922.2021.00999"},{"key":"25_CR83","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A.: Sun database: large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"25_CR84","doi-asserted-by":"crossref","unstructured":"Xu, J., Wang, X.: Rethinking self-supervised correspondence learning: a video frame-level similarity perspective. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10075\u201310085 (2021)","DOI":"10.1109\/ICCV48922.2021.00992"},{"key":"25_CR85","doi-asserted-by":"crossref","unstructured":"Yan, S., et al.: Multiview transformers for video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3333\u20133343 (2022)","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"25_CR86","unstructured":"Zhang, B., et al.: Co-training transformer with videos and images improves action recognition. arXiv preprint arXiv:2112.07175 (2021)"},{"issue":"6","key":"25_CR87","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","volume":"40","author":"B Zhou","year":"2017","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1452\u20131464 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"25_CR88","unstructured":"Zhou, J., et al.: iBOT: image BERT pre-training with online tokenizer. arXiv preprint arXiv:2111.07832 (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73235-5_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T06:18:53Z","timestamp":1727590733000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73235-5_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031732348","9783031732355"],"references-count":88,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73235-5_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}