{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T01:27:17Z","timestamp":1772155637598,"version":"3.50.1"},"publisher-location":"Cham","reference-count":88,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198298","type":"print"},{"value":"9783031198304","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19830-4_36","type":"book-chapter","created":{"date-parts":[[2022,10,21]],"date-time":"2022-10-21T16:21:10Z","timestamp":1666369270000},"page":"632-652","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["How Severe Is Benchmark-Sensitivity in\u00a0Video Self-supervised Learning?"],"prefix":"10.1007","author":[{"given":"Fida Mohammad","family":"Thoker","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hazel","family":"Doughty","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Piyush","family":"Bagad","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cees G. M.","family":"Snoek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,22]]},"reference":[{"key":"36_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/978-3-030-58523-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Afouras","year":"2020","unstructured":"Afouras, T., Owens, A., Chung, J.S., Zisserman, A.: Self-supervised learning of audio-visual objects from video. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12363, pp. 208\u2013224. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58523-5_13"},{"key":"36_CR2","doi-asserted-by":"crossref","unstructured":"Ahsan, U., Madhok, R., Essa, I.: Video jigsaw: Unsupervised learning of spatiotemporal context for video action recognition. In: Proceedings of the IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 179\u2013189. IEEE (2019)","DOI":"10.1109\/WACV.2019.00025"},{"key":"36_CR3","unstructured":"Alwassel, H., Mahajan, D., Korbar, B., Torresani, L., Ghanem, B., Tran, D.: Self-supervised learning by cross-modal audio-video clustering. In: Advances in Neural Information Processing Systems (NeurIPS), vol. 33, pp. 9758\u20139770 (2020)"},{"key":"36_CR4","unstructured":"Asano, Y.M., Patrick, M., Rupprecht, C., Vedaldi, A.: Labelling unlabelled videos from scratch with multi-modal self-supervision. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"36_CR5","unstructured":"Asano, Y.M., Rupprecht, C., Vedaldi, A.: A critical analysis of self-supervision, or what we can learn from a single image. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"36_CR6","unstructured":"Bai, Y., et al.: Can temporal information help with contrastive self-supervised learning? arXiv preprint arXiv:2011.13046 (2020)"},{"key":"36_CR7","doi-asserted-by":"crossref","unstructured":"Benaim, S., et al.: Speednet: Learning the speediness in videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9922\u20139931 (2020)","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"36_CR8","doi-asserted-by":"crossref","unstructured":"Chen, B., et al.: Multimodal clustering networks for self-supervised learning from unlabeled videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (CVPR), pp. 8012\u20138021 (2021)","DOI":"10.1109\/ICCV48922.2021.00791"},{"key":"36_CR9","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: Proceedings of the International Conference on Machine Learning (PMLR) (2020)"},{"key":"36_CR10","unstructured":"Chen, X., Fan, H., Girshick, R., He, K.: Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)"},{"key":"36_CR11","doi-asserted-by":"publisher","first-page":"79562","DOI":"10.1109\/ACCESS.2021.3084840","volume":"9","author":"H Cho","year":"2021","unstructured":"Cho, H., Kim, T., Chang, H.J., Hwang, W.: Self-supervised spatio-temporal representation learning using variable playback speed prediction. IEEE Access 9, 79562\u201379571 (2021)","journal-title":"IEEE Access"},{"key":"36_CR12","doi-asserted-by":"crossref","unstructured":"Cole, E., Yang, X., Wilber, K., Mac Aodha, O., Belongie, S.: When does contrastive visual representation learning work? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01434"},{"key":"36_CR13","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s11263-021-01531-2","volume":"130","author":"D Damen","year":"2021","unstructured":"Damen, D., et al.: Rescaling egocentric vision: Collection, pipeline and challenges for EPIC-KITCHENS-100. Int. J. Comput. Vis. (IJCV) 130, 33\u201355 (2021)","journal-title":"Int. J. Comput. Vis. (IJCV)"},{"key":"36_CR14","doi-asserted-by":"crossref","unstructured":"Dave, I., Gupta, R., Rizve, M.N., Shah, M.: Tclr: Temporal contrastive learning for video representation. In: Computer Vision and Image Understanding (CVIU), p. 103406 (2022)","DOI":"10.1016\/j.cviu.2022.103406"},{"key":"36_CR15","doi-asserted-by":"crossref","unstructured":"Diba, A., et al.: Vi2clr: Video and image for visual contrastive learning of representation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1502\u20131512 (2021)","DOI":"10.1109\/ICCV48922.2021.00153"},{"key":"36_CR16","doi-asserted-by":"crossref","unstructured":"Ericsson, L., Gouk, H., Hospedales, T.M.: How well do self-supervised models transfer? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5414\u20135423 (2021)","DOI":"10.1109\/CVPR46437.2021.00537"},{"key":"36_CR17","doi-asserted-by":"crossref","unstructured":"Ericsson, L., Gouk, H., Hospedales, T.M.: Why do self-supervised models transfer? investigating the impact of invariance on downstream tasks. arXiv preprint arXiv:2111.11398 (2021)","DOI":"10.1109\/CVPR46437.2021.00537"},{"key":"36_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"772","DOI":"10.1007\/978-3-319-10605-2_50","volume-title":"Computer Vision \u2013 ECCV 2014","author":"E Eyjolfsdottir","year":"2014","unstructured":"Eyjolfsdottir, E., et al.: Detecting social actions of fruit flies. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8690, pp. 772\u2013787. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10605-2_50"},{"key":"36_CR19","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6201\u20136210 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"36_CR20","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3299\u20133309 (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"36_CR21","doi-asserted-by":"crossref","unstructured":"Fernando, B., Bilen, H., Gavves, E., Gould, S.: Self-supervised video representation learning with odd-one-out networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3636\u20133645 (2017)","DOI":"10.1109\/CVPR.2017.607"},{"key":"36_CR22","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Jain, M., Karmanov, I., Snoek, C.G.M.: Motion-augmented self-training for video recognition at smaller scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10429\u201310438 (2021)","DOI":"10.1109\/ICCV48922.2021.01026"},{"key":"36_CR23","unstructured":"Ghodrati, A., Gavves, E., Snoek, C.G.M.: Video time: Properties, encoders and evaluation. In: British Machine Vision Conference (BMVC) (2018)"},{"key":"36_CR24","doi-asserted-by":"crossref","unstructured":"Goyal, P., Mahajan, D., Gupta, A., Misra, I.: Scaling and benchmarking self-supervised visual representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6391\u20136400 (2019)","DOI":"10.1109\/ICCV.2019.00649"},{"key":"36_CR25","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \"something something\" video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"36_CR26","unstructured":"Grauman, K., et al.: Ego4d: Around the World in 3,000 Hours of Egocentric Video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)"},{"key":"36_CR27","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: Ava: A video dataset of spatio-temporally localized atomic visual actions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"36_CR28","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Video representation learning by dense predictive coding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"36_CR29","unstructured":"Han, T., Xie, W., Zisserman, A.: Self-supervised co-training for video representation learning. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"36_CR30","doi-asserted-by":"crossref","unstructured":"Hu, D., Nie, F., Li, X.: Deep multimodal clustering for unsupervised audiovisual learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9248\u20139257 (2019)","DOI":"10.1109\/CVPR.2019.00947"},{"key":"36_CR31","doi-asserted-by":"crossref","unstructured":"Huang, D., et al.: Ascnet: Self-supervised video representation learning with appearance-speed consistency. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8096\u20138105 (2021)","DOI":"10.1109\/ICCV48922.2021.00799"},{"key":"36_CR32","doi-asserted-by":"crossref","unstructured":"Huo, Y., et al.: Self-supervised video representation learning with constrained spatiotemporal jigsaw. In: Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence (IJCAI) (2021)","DOI":"10.24963\/ijcai.2021\/104"},{"key":"36_CR33","doi-asserted-by":"crossref","unstructured":"Islam, A., Chen, C.F.R., Panda, R., Karlinsky, L., Radke, R., Feris, R.: A broad study on the transferability of visual representations with contrastive learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8845\u20138855 (2021)","DOI":"10.1109\/ICCV48922.2021.00872"},{"key":"36_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1007\/978-3-030-58604-1_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Jenni","year":"2020","unstructured":"Jenni, S., Meishvili, G., Favaro, P.: Video representation learning by recognizing temporal transformations. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 425\u2013442. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_26"},{"key":"36_CR35","unstructured":"Jing, L., Yang, X., Liu, J., Tian, Y.: Self-supervised spatiotemporal feature learning via video rotation prediction. arXiv preprint arXiv:1811.11387 (2018)"},{"key":"36_CR36","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"36_CR37","doi-asserted-by":"crossref","unstructured":"Kim, D., Cho, D., Kweon, I.S.: Self-supervised video representation learning with space-time cubic puzzles. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8545\u20138552 (2019)","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"36_CR38","doi-asserted-by":"crossref","unstructured":"Kolesnikov, A., Zhai, X., Beyer, L.: Revisiting self-supervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1920\u20131929 (2019)","DOI":"10.1109\/CVPR.2019.00202"},{"key":"36_CR39","unstructured":"K\u00f6p\u00fckl\u00fc, O., Wei, X., Rigoll, G.: You only watch once: A unified CNN architecture for real-time spatiotemporal action localization. arXiv preprint arXiv:1911.06644 (2019)"},{"key":"36_CR40","unstructured":"Korbar, B., Tran, D., Torresani, L.: Cooperative learning of audio and video models from self-supervised synchronization. In: Advances in Neural Information Processing Systems (NeurIPS), vol. 31 (2018)"},{"key":"36_CR41","doi-asserted-by":"crossref","unstructured":"Kotar, K., Ilharco, G., Schmidt, L., Ehsani, K., Mottaghi, R.: Contrasting contrastive self-supervised representation learning pipelines. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9949\u20139959 (2021)","DOI":"10.1109\/ICCV48922.2021.00980"},{"key":"36_CR42","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: Proceedings of the International Conference on Computer Vision (ICCV) (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"36_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1007\/978-3-030-01231-1_32","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Li","year":"2018","unstructured":"Li, Y., Li, Y., Vasconcelos, N.: RESOUND: Towards action recognition without representation bias. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 520\u2013535. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_32"},{"key":"36_CR44","doi-asserted-by":"crossref","unstructured":"Lin, Y., Guo, X., Lu, Y.: Self-supervised video representation learning with meta-contrastive network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8239\u20138249 (2021)","DOI":"10.1109\/ICCV48922.2021.00813"},{"key":"36_CR45","doi-asserted-by":"crossref","unstructured":"Luo, D., et al.: Video cloze procedure for self-supervised spatio-temporal learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 11701\u201311708 (2020)","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"36_CR46","unstructured":"Ma, S., Zeng, Z., McDuff, D., Song, Y.: Active contrastive learning of audio-visual video representations. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"36_CR47","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"437","DOI":"10.1007\/978-3-319-46454-1_27","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Mettes","year":"2016","unstructured":"Mettes, P., van Gemert, J.C., Snoek, C.G.M.: Spot On: Action localization from pointly-supervised proposals. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 437\u2013453. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_27"},{"key":"36_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"36_CR49","doi-asserted-by":"crossref","unstructured":"Morgado, P., Vasconcelos, N., Misra, I.: Audio-visual instance discrimination with cross-modal agreement. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"36_CR50","doi-asserted-by":"crossref","unstructured":"Newell, A., Deng, J.: How useful is self-supervised pretraining for visual tasks? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00737"},{"key":"36_CR51","doi-asserted-by":"crossref","unstructured":"Ng, X.L., Ong, K.E., Zheng, Q., Ni, Y., Yeo, S.Y., Liu, J.: Animal kingdom: A large and diverse dataset for animal behavior understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19023\u201319034 (2022)","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"36_CR52","unstructured":"Nguyen, T., Raghu, M., Kornblith, S.: Do wide and deep networks learn the same things? uncovering how neural network representations vary with width and depth. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"36_CR53","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., Liu, W.: Videomoco: Contrastive video representation learning with temporally adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11205\u201311214 (2021)","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"36_CR54","unstructured":"Patrick, M., et al.: Multi-modal self-supervision from generalized data transformations. In: International Conference on Computer Vision (ICCV) (2021)"},{"key":"36_CR55","doi-asserted-by":"crossref","unstructured":"Pedersen, M., Haurum, J.B., Bengtson, S.H., Moeslund, T.B.: 3d-zef: A 3d zebrafish tracking benchmark dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2426\u20132436 (2020)","DOI":"10.1109\/CVPR42600.2020.00250"},{"key":"36_CR56","unstructured":"Peihao, C., et al.: Rspnet: Relative speed perception for unsupervised video representation learning. In: The AAAI Conference on Artificial Intelligence (AAAI) (2021)"},{"key":"36_CR57","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Angelova, A., Ryoo, M.S.: Evolving losses for unsupervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 133\u2013142 (2020)","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"36_CR58","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6964\u20136974 (2021)","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"36_CR59","doi-asserted-by":"crossref","unstructured":"Recasens, A., et al.: Broaden your views for self-supervised video learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1255\u20131265 (2021)","DOI":"10.1109\/ICCV48922.2021.00129"},{"key":"36_CR60","doi-asserted-by":"crossref","unstructured":"Sariyildiz, M.B., Kalantidis, Y., Larlus, D., Alahari, K.: Concept generalization in visual representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9629\u20139639 (2021)","DOI":"10.1109\/ICCV48922.2021.00949"},{"key":"36_CR61","doi-asserted-by":"crossref","unstructured":"Schiappa, M.C., Rawat, Y.S., Shah, M.: Self-supervised learning for videos: A survey. arXiv preprint arXiv:2207.00419 (2022)","DOI":"10.1145\/3577925"},{"key":"36_CR62","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: Ntu rgb+ d: A large scale dataset for 3d human activity analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1010\u20131019 (2016)","DOI":"10.1109\/CVPR.2016.115"},{"key":"36_CR63","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., Lin, D.: Finegym: A hierarchical video dataset for fine-grained action understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"36_CR64","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"36_CR65","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"36_CR66","doi-asserted-by":"crossref","unstructured":"Sun, C., Nagrani, A., Tian, Y., Schmid, C.: Composable augmentation encoding for video representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8834\u20138844 (2021)","DOI":"10.1109\/ICCV48922.2021.00871"},{"key":"36_CR67","unstructured":"Sun, J.J., et al.: The multi-agent behavior dataset: Mouse dyadic social interactions. In: Vanschoren, J., Yeung, S. (eds.) Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks (2021)"},{"key":"36_CR68","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1007\/978-3-030-11012-3_45","volume-title":"Computer Vision \u2013 ECCV 2018 Workshops","author":"T Suzuki","year":"2019","unstructured":"Suzuki, T., Itazuri, T., Hara, K., Kataoka, H.: Learning spatiotemporal 3d convolution with video order self-supervision. In: Leal-Taix\u00e9, L., Roth, S. (eds.) ECCV 2018. LNCS, vol. 11130, pp. 590\u2013598. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-11012-3_45"},{"key":"36_CR69","doi-asserted-by":"crossref","unstructured":"Tao, L., Wang, X., Yamasaki, T.: Self-supervised video representation learning using inter-intra contrastive framework. In: Proceedings of the 28th ACM International Conference on Multimedia (ACM MM), pp. 2193\u20132201 (2020)","DOI":"10.1145\/3394171.3413694"},{"key":"36_CR70","unstructured":"Tao, L., Wang, X., Yamasaki, T.: Pretext-contrastive learning: Toward good practices in self-supervised video representation leaning. arXiv preprint arXiv:2010.15464 (2021)"},{"key":"36_CR71","doi-asserted-by":"crossref","unstructured":"Thoker, F.M., Doughty, H., Snoek, C.: Skeleton-contrastive 3d action representation learning. In: Proceedings of the 29th ACM International Conference on Multimedia, (ACM MM ) (2021)","DOI":"10.1145\/3474085.3475307"},{"key":"36_CR72","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"36_CR73","doi-asserted-by":"crossref","unstructured":"Van Horn, G., Cole, E., Beery, S., Wilber, K., Belongie, S., Mac Aodha, O.: Benchmarking representation learning for natural world image collections. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12884\u201312893 (2021)","DOI":"10.1109\/CVPR46437.2021.01269"},{"key":"36_CR74","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"717","DOI":"10.1007\/978-3-030-58574-7_43","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Wallace","year":"2020","unstructured":"Wallace, B., Hariharan, B.: Extending and analyzing self-supervised learning across domains. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12371, pp. 717\u2013734. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58574-7_43"},{"key":"36_CR75","doi-asserted-by":"crossref","unstructured":"Wang, G., Zhou, Y., Luo, C., Xie, W., Zeng, W., Xiong, Z.: Unsupervised visual representation learning by tracking patches in video. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00259"},{"key":"36_CR76","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiao, J., Bao, L., He, S., Liu, Y., Liu, W.: Self-supervised spatio-temporal representation learning for videos by predicting motion and appearance statistics. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4006\u20134015 (2019)","DOI":"10.1109\/CVPR.2019.00413"},{"key":"36_CR77","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1007\/978-3-030-58520-4_30","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Wang","year":"2020","unstructured":"Wang, J., Jiao, J., Liu, Y.-H.: Self-supervised video representation learning by pace prediction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 504\u2013521. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_30"},{"key":"36_CR78","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Removing the background by adding the background: Towards background robust self-supervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.01163"},{"key":"36_CR79","doi-asserted-by":"crossref","unstructured":"Wei, D., Lim, J.J., Zisserman, A., Freeman, W.T.: Learning and using the arrow of time. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8052\u20138060 (2018)","DOI":"10.1109\/CVPR.2018.00840"},{"key":"36_CR80","unstructured":"Xiao, F., Tighe, J., Modolo, D.: Modist: Motion distillation for self-supervised video representation learning. arXiv preprint arXiv:2106.09703 (2021)"},{"key":"36_CR81","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., Shao, J., Xie, D., Zhuang, Y.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10334\u201310343 (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"36_CR82","unstructured":"Yang, C., Xu, Y., Dai, B., Zhou, B.: Video representation learning with visual tempo consistency. arXiv preprint arXiv:2006.15489 (2020)"},{"key":"36_CR83","doi-asserted-by":"crossref","unstructured":"Yang, X., He, X., Liang, Y., Yang, Y., Zhang, S., Xie, P.: Transfer learning or self-supervised learning? a tale of two pretraining paradigms. arXiv preprint arXiv:2007.04234 (2020)","DOI":"10.36227\/techrxiv.12502298.v1"},{"key":"36_CR84","doi-asserted-by":"crossref","unstructured":"Yao, T., Zhang, Y., Qiu, Z., Pan, Y., Mei, T.: Seco: Exploring sequence supervision for unsupervised representation learning. In: AAAI, vol. 2, p. 7 (2021)","DOI":"10.1609\/aaai.v35i12.17274"},{"key":"36_CR85","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liu, C., Luo, D., Zhou, Y., Ye, Q.: Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6548\u20136557 (2020)","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"36_CR86","unstructured":"Zhai, X., et al.: A large-scale study of representation learning with the visual task adaptation benchmark. arXiv preprint arXiv:1910.04867 (2019)"},{"key":"36_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, X., Han, G., He, S.: Context-aware and scale-insensitive temporal repetition counting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00075"},{"key":"36_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Contrastive spatio-temporal pretext learning for self-supervised video representation. In: Proceedings of the AAAI Conference on Artificial Intelligenc (2022)","DOI":"10.1609\/aaai.v36i3.20248"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19830-4_36","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,3,9]],"date-time":"2023-03-09T13:04:50Z","timestamp":1678367090000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19830-4_36"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198298","9783031198304"],"references-count":88,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19830-4_36","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"22 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}