{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,27]],"date-time":"2025-07-27T07:15:35Z","timestamp":1753600535247,"version":"3.40.3"},"publisher-location":"Cham","reference-count":74,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031732461"},{"type":"electronic","value":"9783031732478"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73247-8_3","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T12:02:20Z","timestamp":1730376140000},"page":"37-56","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Data Collection-Free Masked Video Modeling"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9485-6840","authenticated-orcid":false,"given":"Yuchi","family":"Ishikawa","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6485-2480","authenticated-orcid":false,"given":"Masayoshi","family":"Kondo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7361-0027","authenticated-orcid":false,"given":"Yoshimitsu","family":"Aoki","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"3_CR1","unstructured":"Abu-El-Haija, S., et al.: Youtube-8m: a large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 (2016)"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Ahsan, U., Madhok, R., Essa, I.: Video jigsaw: Unsupervised learning of spatiotemporal context for video action recognition. In: 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 179\u2013189. IEEE (2019)","DOI":"10.1109\/WACV.2019.00025"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: Vivit: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"3_CR4","unstructured":"Asano, Y.M., Rupprecht, C., Zisserman, A., Vedaldi, A.: Pass: an imagenet replacement for self-supervised pretraining without humans. arXiv preprint arXiv:2109.13228 (2021)"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Bandara, W.G.C., Patel, N., Gholami, A., Nikkhah, M., Agrawal, M., Patel, V.M.: Adamae: adaptive masking for efficient spatiotemporal learning with masked autoencoders. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14507\u201314517 (2023)","DOI":"10.1109\/CVPR52729.2023.01394"},{"key":"3_CR6","unstructured":"Baradad, M., Chen, R., Wulff, J., Wang, T., Feris, R., Torralba, A., Isola, P.: Procedural image programs for representation learning. In: Advance in Neural Information Processing System,vol. 35, pp. 6450\u20136462 (2022)"},{"key":"3_CR7","unstructured":"Baradad Jurjo, M., Wulff, J., Wang, T., Isola, P., Torralba, A.: Learning to see by looking at noise. In: Advance in Neural Information Processing System, vol. 34, pp. 2556\u20132569 (2021)"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Benaim, S., et al.: Speednet: learning the speediness in videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9922\u20139931 (2020)","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"3_CR9","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol.\u00a02, p.\u00a04 (2021)"},{"key":"3_CR10","unstructured":"Buolamwini, J., Gebru, T.: Gender shades: intersectional accuracy disparities in commercial gender classification. In: Conference on Fairness, Accountability and Transparency, pp. 77\u201391. PMLR (2018)"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An empirical study of training self-supervised vision transformers. arXiv preprint arXiv:2104.020572(5), 6 (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"3_CR13","unstructured":"Choi, J., Gao, C., Messou, J.C., Huang, J.B.: Why can\u2019t i dance in the mall? learning to mitigate scene bias in action recognition. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"De\u00a0Souza, C., Gaidon, A., Cabon, Y., L\u00f3pez, A.M.: Procedural generation of videos to train deep action recognition networks. corr. arXiv preprint arXiv:1612.00881 (2016)","DOI":"10.1109\/CVPR.2017.278"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"3_CR16","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"3_CR17","unstructured":"Feichtenhofer, C., Fan, H., Li, Y., He, K.: Masked autoencoders as spatiotemporal learners. arXiv preprint arXiv:2205.09113 (2022)"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3299\u20133309 (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Fernando, B., Bilen, H., Gavves, E., Gould, S.: Self-supervised video representation learning with odd-one-out networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3636\u20133645 (2017)","DOI":"10.1109\/CVPR.2017.607"},{"key":"3_CR20","unstructured":"Fischer, P., et al.: Flownet: learning optical flow with convolutional networks. arXiv preprint arXiv:1504.06852 (2015)"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Ghadiyaram, D., Tran, D., Mahajan, D.: Large-scale weakly-supervised pre-training for video action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12046\u201312055 (2019)","DOI":"10.1109\/CVPR.2019.01232"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Goyal, R., et\u00a0al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Guo, X., et al.: Learning video representations of human motion from synthetic data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20197\u201320207 (2022)","DOI":"10.1109\/CVPR52688.2022.01956"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., Satoh, Y.: Can spatiotemporal 3D CNNs retrace the history of 2D cnns and imagenet? In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6546\u20136555 (2018)","DOI":"10.1109\/CVPR.2018.00685"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Huang, Z., Zhang, S., Jiang, J., Tang, M., Jin, R., Ang, M.H.: Self-supervised motion learning from static images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1276\u20131285 (2021)","DOI":"10.1109\/CVPR46437.2021.00133"},{"key":"3_CR26","unstructured":"Hwang, H., Jang, C., Park, G., Cho, J., Kim, I.J.: Eldersim: a synthetic data generation platform for human action recognition in eldercare applications. IEEE Access (2021)"},{"key":"3_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1007\/978-3-030-58604-1_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Jenni","year":"2020","unstructured":"Jenni, S., Meishvili, G., Favaro, P.: Video representation learning by recognizing temporal transformations. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 425\u2013442. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_26"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Kataoka, H., Hara, K., Hayashi, R., Yamagata, E., Inoue, N.: Spatiotemporal initialization for 3D CNNs with generated motion patterns. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1279\u20131288 (2022)","DOI":"10.1109\/WACV51458.2022.00081"},{"key":"3_CR29","doi-asserted-by":"crossref","unstructured":"Kataoka, H., et al.: Replacing labeled real-image datasets with auto-generated contours. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21232\u201321241 (2022)","DOI":"10.1109\/CVPR52688.2022.02055"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Kataoka, H., et al.: Pre-training without natural images. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69544-6_35"},{"key":"3_CR31","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Kim, D., Cho, D., Kweon, I.S.: Self-supervised video representation learning with space-time cubic puzzles. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 8545\u20138552 (2019)","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"3_CR33","unstructured":"Kim, Y.W., et al.: How transferable are video representations based on synthetic data? In: Advances in Neural Information Processing Systems, vol. 35, pp. 35710\u201335723 (2022)"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Unmasked teacher: towards training-efficient video foundation models. arXiv preprint arXiv:2303.16058 (2023)","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"3_CR36","doi-asserted-by":"crossref","unstructured":"Li, T., Liu, J., Zhang, W., Ni, Y., Wang, W., Li, Z.: UAV-human: a large benchmark for human behavior understanding with unmanned aerial vehicles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16266\u201316275 (2021)","DOI":"10.1109\/CVPR46437.2021.01600"},{"key":"3_CR37","doi-asserted-by":"crossref","unstructured":"Li, Y., Vasconcelos, N.: Repair: removing representation bias by dataset resampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9572\u20139581 (2019)","DOI":"10.1109\/CVPR.2019.00980"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, Y., Vasconcelos, N.: Resound: towards action recognition without representation bias. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 513\u2013528 (2018)","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"3_CR39","doi-asserted-by":"crossref","unstructured":"Ma, Z., et\u00a0al.: Order-prompted tag sequence generation for video tagging. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15681\u201315690 (2023)","DOI":"10.1109\/ICCV51070.2023.01437"},{"key":"3_CR40","doi-asserted-by":"crossref","unstructured":"Mahajan, D., et al.: Exploring the limits of weakly supervised pretraining. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 181\u2013196 (2018)","DOI":"10.1007\/978-3-030-01216-8_12"},{"key":"3_CR41","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"Mu, J., Qiu, W., Hager, G.D., Yuille, A.L.: Learning from synthetic animals. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12386\u201312395 (2020)","DOI":"10.1109\/CVPR42600.2020.01240"},{"key":"3_CR43","doi-asserted-by":"crossref","unstructured":"Nakamura, R., Kataoka, H., Takashima, S., Noriega, E.J.M., Yokota, R., Inoue, N.: Pre-training vision transformers with very limited synthesized images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20360\u201320369 (2023)","DOI":"10.1109\/ICCV51070.2023.01862"},{"key":"3_CR44","doi-asserted-by":"crossref","unstructured":"Nakashima, K., Kataoka, H., Matsumoto, A., Iwata, K., Inoue, N., Satoh, Y.: Can vision transformers learn without natural images? In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 1990\u20131998 (2022)","DOI":"10.1609\/aaai.v36i2.20094"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., Liu, W.: Videomoco: contrastive video representation learning with temporally adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11205\u201311214 (2021)","DOI":"10.1109\/CVPR46437.2021.01105"},{"issue":"3","key":"3_CR46","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1145\/325165.325247","volume":"19","author":"K Perlin","year":"1985","unstructured":"Perlin, K.: An image synthesizer. ACM Siggraph Comput. Graph. 19(3), 287\u2013296 (1985)","journal-title":"ACM Siggraph Comput. Graph."},{"key":"3_CR47","doi-asserted-by":"crossref","unstructured":"Perlin, K.: Improving noise. In: Proceedings of the 29th Annual Conference on Computer Graphics and Interactive Techniques, pp. 681\u2013682 (2002)","DOI":"10.1145\/566570.566636"},{"key":"3_CR48","unstructured":"Schuhmann, C., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models. In: Advance in Neural Information Processing System, vol. 35, pp. 25278\u201325294 (2022)"},{"key":"3_CR49","unstructured":"Shu, F., et al.: Masked contrastive pre-training for efficient video-text retrieval. arXiv preprint arXiv:2212.00986 (2022)"},{"key":"3_CR50","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"3_CR51","doi-asserted-by":"crossref","unstructured":"Roberto\u00a0de Souza, C., Gaidon, A., Cabon, Y., Manuel\u00a0Lopez, A.: Procedural generation of videos to train deep action recognition networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4757\u20134767 (2017)","DOI":"10.1109\/CVPR.2017.278"},{"key":"3_CR52","doi-asserted-by":"crossref","unstructured":"Sun, C., Shrivastava, A., Singh, S., Gupta, A.: Revisiting unreasonable effectiveness of data in deep learning era. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 843\u2013852 (2017)","DOI":"10.1109\/ICCV.2017.97"},{"key":"3_CR53","doi-asserted-by":"crossref","unstructured":"Sun, X., et al.: Masked motion encoding for self-supervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2235\u20132245 (2023)","DOI":"10.1109\/CVPR52729.2023.00222"},{"key":"3_CR54","doi-asserted-by":"crossref","unstructured":"Takashima, S., Hayamizu, R., Inoue, N., Kataoka, H., Yokota, R.: Visual atoms: pre-training vision transformers with sinusoidal waves. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18579\u201318588 (2023)","DOI":"10.1109\/CVPR52729.2023.01782"},{"key":"3_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58536-5_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Teed","year":"2020","unstructured":"Teed, Z., Deng, J.: RAFT: recurrent all-pairs field transforms for optical flow. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020 Part II. LNCS, vol. 12347, pp. 402\u2013419. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_24"},{"key":"3_CR56","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv preprint arXiv:2203.12602 (2022)"},{"key":"3_CR57","doi-asserted-by":"crossref","unstructured":"Toyer, S., Cherian, A., Han, T., Gould, S.: Human pose forecasting via deep Markov models. In: 2017 International Conference on Digital Image Computing: Techniques and Applications (DICTA), pp.\u00a01\u20138. IEEE (2017)","DOI":"10.1109\/DICTA.2017.8227441"},{"issue":"7","key":"3_CR58","doi-asserted-by":"publisher","first-page":"2264","DOI":"10.1007\/s11263-021-01467-7","volume":"129","author":"G Varol","year":"2021","unstructured":"Varol, G., Laptev, I., Schmid, C., Zisserman, A.: Synthetic humans for action recognition from unseen viewpoints. Int. J. Comput. Vis. 129(7), 2264\u20132287 (2021)","journal-title":"Int. J. Comput. Vis."},{"key":"3_CR59","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1007\/978-3-030-58520-4_30","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Wang","year":"2020","unstructured":"Wang, J., Jiao, J., Liu, Y.-H.: Self-supervised video representation learning by pace prediction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020 Part XVII. LNCS, vol. 12362, pp. 504\u2013521. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_30"},{"key":"3_CR60","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Videomae v2: scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"3_CR61","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: BEVT: bert pretraining of video transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14733\u201314743 (2022)","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"3_CR62","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: Masked video distillation: rethinking masked feature modeling for self-supervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6312\u20136322 (2023)","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"3_CR63","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhao, J., Yatskar, M., Chang, K.W., Ordonez, V.: Balanced datasets are not enough: Estimating and mitigating gender bias in deep image representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5310\u20135319 (2019)","DOI":"10.1109\/ICCV.2019.00541"},{"key":"3_CR64","doi-asserted-by":"crossref","unstructured":"Wei, C., Fan, H., Xie, S., Wu, C.Y., Yuille, A., Feichtenhofer, C.: Masked feature prediction for self-supervised visual pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14668\u201314678 (2022)","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"3_CR65","doi-asserted-by":"crossref","unstructured":"Wu, Z., Wang, Z., Wang, Z., Jin, H.: Towards privacy-preserving visual recognition via adversarial training: a pilot study. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 606\u2013624 (2018)","DOI":"10.1007\/978-3-030-01270-0_37"},{"key":"3_CR66","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., Shao, J., Xie, D., Zhuang, Y.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343 (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"3_CR67","unstructured":"Yang, H., et al.: Self-supervised video representation learning with motion-aware masked autoencoders. arXiv preprint arXiv:2210.04154 (2022)"},{"key":"3_CR68","doi-asserted-by":"crossref","unstructured":"Yang, K., Qinami, K., Fei-Fei, L., Deng, J., Russakovsky, O.: Towards fairer datasets: filtering and balancing the distribution of the people subtree in the imagenet hierarchy. In: Proceedings of the 2020 Conference on Fairness, Accountability, and Transparency, pp. 547\u2013558 (2020)","DOI":"10.1145\/3351095.3375709"},{"key":"3_CR69","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liu, C., Luo, D., Zhou, Y., Ye, Q.: Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6548\u20136557 (2020)","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"3_CR70","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., Yoo, Y.: Cutmix: regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6023\u20136032 (2019)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"3_CR71","unstructured":"Yun, S., Oh, S.J., Heo, B., Han, D., Kim, J.: Videomix: rethinking data augmentation for video classification. arXiv preprint arXiv:2012.03457 (2020)"},{"key":"3_CR72","unstructured":"Zhang, H., Cisse, M., Dauphin, Y.N., Lopez-Paz, D.: mixup: beyond empirical risk minimization. arXiv preprint arXiv:1710.09412 (2017)"},{"key":"3_CR73","unstructured":"Zhang, Z., et al.: Create: a benchmark for Chinese short video retrieval and title generation. arXiv preprint arXiv:2203.16763 (2022)"},{"key":"3_CR74","unstructured":"Zhong, H., et al.: Learning human action recognition representations without real humans. arXiv preprint arXiv:2311.06231 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73247-8_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T12:04:14Z","timestamp":1730376254000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73247-8_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031732461","9783031732478"],"references-count":74,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73247-8_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}