{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T01:28:16Z","timestamp":1772155696450,"version":"3.50.1"},"publisher-location":"Cham","reference-count":83,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198083","type":"print"},{"value":"9783031198090","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19809-0_9","type":"book-chapter","created":{"date-parts":[[2022,10,31]],"date-time":"2022-10-31T07:03:04Z","timestamp":1667199784000},"page":"145-164","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Static and\u00a0Dynamic Concepts for\u00a0Self-supervised Video Representation Learning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0378-6438","authenticated-orcid":false,"given":"Rui","family":"Qian","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7033-774X","authenticated-orcid":false,"given":"Shuangrui","family":"Ding","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9817-7418","authenticated-orcid":false,"given":"Xian","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8865-7896","authenticated-orcid":false,"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,1]]},"reference":[{"key":"9_CR1","unstructured":"Alvarez Melis, D., Jaakkola, T.: Towards robust interpretability with self-explaining neural networks. In: Advances in Neural Information Processing Systems 31 (2018)"},{"key":"9_CR2","unstructured":"Asano, Y.M., Patrick, M., Rupprecht, C., Vedaldi, A.: Labelling unlabelled videos from scratch with multi-modal self-supervision. arXiv preprint arXiv:2006.13662 (2020)"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Behrmann, N., Fayyaz, M., Gall, J., Noroozi, M.: Long short view feature decomposition via contrastive video representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9244\u20139253 (2021)","DOI":"10.1109\/ICCV48922.2021.00911"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Behrmann, N., Gall, J., Noroozi, M.: Unsupervised video representation learning by bidirectional feature prediction. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1670\u20131679 (2021)","DOI":"10.1109\/WACV48630.2021.00171"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Benaim, S., et al.: Learning the speediness in videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9922\u20139931 (2020)","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"9_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"695","DOI":"10.1007\/978-3-030-20890-5_44","volume-title":"Computer Vision \u2013 ACCV 2018","author":"M Bucher","year":"2019","unstructured":"Bucher, M., Herbin, S., Jurie, F.: Semantic bottleneck for computer vision tasks. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11362, pp. 695\u2013712. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20890-5_44"},{"key":"9_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01264-9_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Caron","year":"2018","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 139\u2013156. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9"},{"key":"9_CR8","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. arXiv preprint arXiv:2006.09882 (2020)"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"9_CR10","unstructured":"Chen, B., Selvaraju, R.R., Chang, S.F., Niebles, J.C., Naik, N.: Previts: contrastive pretraining with video tracking supervision. arXiv preprint arXiv:2112.00804 (2021)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Chen, P., et al.: RSPNet: relative speed perception for unsupervised video representation learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 1 (2021)","DOI":"10.1609\/aaai.v35i2.16189"},{"key":"9_CR12","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International conference on machine learning, pp. 1597\u20131607. PMLR (2020)"},{"issue":"12","key":"9_CR13","doi-asserted-by":"publisher","first-page":"772","DOI":"10.1038\/s42256-020-00265-z","volume":"2","author":"Z Chen","year":"2020","unstructured":"Chen, Z., Bei, Y., Rudin, C.: Concept whitening for interpretable image recognition. Nat. Mach. Intell. 2(12), 772\u2013782 (2020)","journal-title":"Nat. Mach. Intell."},{"key":"9_CR14","unstructured":"Cuturi, M.: Sinkhorn distances: lightspeed computation of optimal transport. In: NIPS. vol. 2, p. 4 (2013)"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Dave, I., Gupta, R., Rizve, M.N., Shah, M.: TCLR: temporal contrastive learning for video representation. arXiv preprint arXiv:2101.07974 (2021)","DOI":"10.1016\/j.cviu.2022.103406"},{"issue":"9","key":"9_CR16","doi-asserted-by":"publisher","first-page":"1342","DOI":"10.1038\/s41591-018-0107-6","volume":"24","author":"J De Fauw","year":"2018","unstructured":"De Fauw, J., et al.: Clinically applicable deep learning for diagnosis and referral in retinal disease. Nat. Med. 24(9), 1342\u20131350 (2018)","journal-title":"Nat. Med."},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Ding, S., et al.: Motion-aware contrastive video representation learning via foreground-background merging. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9716\u20139726 (2022)","DOI":"10.1109\/CVPR52688.2022.00949"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Ding, S., Qian, R., Xiong, H.: Dual contrastive learning for spatio-temporal representation. arXiv preprint arXiv:2207.05340 (2022)","DOI":"10.1145\/3503161.3547783"},{"key":"9_CR19","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"9_CR20","unstructured":"Dosovitskiy, A., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with convolutional neural networks. In: Advances in neural information processing systems 27 (2014)"},{"key":"9_CR21","unstructured":"Ermolov, A., Siarohin, A., Sangineto, E., Sebe, N.: Whitening for self-supervised representation learning. In: International Conference on Machine Learning, pp. 3015\u20133024. PMLR (2021)"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3299\u20133309 (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"9_CR23","unstructured":"Gao, P., Lu, J., Li, H., Mottaghi, R., Kembhavi, A.: Container: context aggregation network. arXiv preprint arXiv:2106.01401 (2021)"},{"key":"9_CR24","unstructured":"Gidaris, S., Singh, P., Komodakis, N.: Unsupervised representation learning by predicting image rotations. arXiv preprint arXiv:1803.07728 (2018)"},{"key":"9_CR25","unstructured":"Gutmann, M., Hyv\u00e4rinen, A.: Noise-contrastive estimation: a new estimation principle for unnormalized statistical models. In: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics, pp. 297\u2013304. JMLR Workshop and Conference Proceedings (2010)"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Video representation learning by dense predictive coding. In: Proceedings of the IEEE International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"9_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"312","DOI":"10.1007\/978-3-030-58580-8_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Han","year":"2020","unstructured":"Han, T., Xie, W., Zisserman, A.: Memory-augmented dense predictive coding for video representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 312\u2013329. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_19"},{"key":"9_CR28","unstructured":"Han, T., Xie, W., Zisserman, A.: Self-supervised co-training for video representation learning. arXiv preprint arXiv:2010.09709 (2020)"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"9_CR30","first-page":"10077","volume":"33","author":"D Hu","year":"2020","unstructured":"Hu, D., et al.: Discriminative sounding objects localization via self-supervised audiovisual matching. Adv. Neural. Inf. Process. Syst. 33, 10077\u201310087 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Huang, D., et al.: ASCNet: self-supervised video representation learning with appearance-speed consistency. arXiv preprint arXiv:2106.02342 (2021)","DOI":"10.1109\/ICCV48922.2021.00799"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Huang, L., Liu, Y., Wang, B., Pan, P., Xu, Y., Jin, R.: Self-supervised video representation learning by context and motion decoupling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13886\u201313895 (2021)","DOI":"10.1109\/CVPR46437.2021.01367"},{"key":"9_CR33","unstructured":"Jabri, A., Owens, A., Efros, A.A.: Space-time correspondence as a contrastive random walk. arXiv preprint arXiv:2006.14613 (2020)"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Jenni, S., Jin, H.: Time-equivariant contrastive video representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9970\u20139980 (2021)","DOI":"10.1109\/ICCV48922.2021.00982"},{"key":"9_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1007\/978-3-030-58604-1_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Jenni","year":"2020","unstructured":"Jenni, S., Meishvili, G., Favaro, P.: Video representation learning by recognizing temporal transformations. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 425\u2013442. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_26"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Kim, D., Cho, D., Kweon, I.S.: Self-supervised video representation learning with space-time cubic puzzles. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 8545\u20138552 (2019)","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Kim, D., Cho, D., Yoo, D., Kweon, I.S.: Learning image representations by completing damaged jigsaw puzzles. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 793\u2013802. IEEE (2018)","DOI":"10.1109\/WACV.2018.00092"},{"key":"9_CR38","unstructured":"Koh, P.W., et al.: Concept bottleneck models. In: International Conference on Machine Learning, pp. 5338\u20135348. PMLR (2020)"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Kuang, H., et al.: Video contrastive learning with global context. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3195\u20133204 (2021)","DOI":"10.1109\/ICCVW54120.2021.00358"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"9_CR41","doi-asserted-by":"crossref","unstructured":"Li, R., Zhang, Y., Qiu, Z., Yao, T., Liu, D., Mei, T.: Motion-focused contrastive learning of video representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2105\u20132114 (2021)","DOI":"10.1109\/ICCV48922.2021.00211"},{"key":"9_CR42","unstructured":"Li, X., Liu, S., De Mello, S., Wang, X., Kautz, J., Yang, M.H.: Joint-task self-supervised learning for temporal correspondence. arXiv preprint arXiv:1909.11895 (2019)"},{"key":"9_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1007\/978-3-030-01231-1_32","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Li","year":"2018","unstructured":"Li, Y., Li, Y., Vasconcelos, N.: RESOUND: towards action recognition without representation bias. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 520\u2013535. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_32"},{"key":"9_CR44","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: Visual sound localization in the wild by cross-modal interference erasing. arXiv preprint arXiv:2202.06406 (2022)","DOI":"10.1609\/aaai.v36i2.20073"},{"key":"9_CR45","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10462\u201310472 (2022)","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"9_CR46","doi-asserted-by":"crossref","unstructured":"Liu, X., Xu, Y., Wu, Q., Zhou, H., Wu, W., Zhou, B.: Semantic-aware implicit neural audio-driven video portrait generation. arXiv preprint arXiv:2201.07786 (2022)","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"9_CR47","unstructured":"Losch, M., Fritz, M., Schiele, B.: Interpretability beyond classification output: semantic bottleneck networks. arXiv preprint arXiv:1907.10882 (2019)"},{"key":"9_CR48","doi-asserted-by":"crossref","unstructured":"Luo, D., et al.: Video cloze procedure for self-supervised spatio-temporal learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11701\u201311708 (2020)","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Luo, Z., Peng, B., Huang, D.A., Alahi, A., Fei-Fei, L.: Unsupervised learning of long-term motion dynamics for videos. In: Proceedings of the IEEE international conference on computer vision, pp. 2203\u20132212 (2017)","DOI":"10.1109\/CVPR.2017.751"},{"key":"9_CR50","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"9_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"9_CR52","unstructured":"van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"9_CR53","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., Liu, W.: Videomoco: contrastive video representation learning with temporally adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11205\u201311214 (2021)","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"9_CR54","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Angelova, A., Ryoo, M.S.: Evolving losses for unsupervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 133\u2013142 (2020)","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"9_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"292","DOI":"10.1007\/978-3-030-58565-5_18","volume-title":"Computer Vision \u2013 ECCV 2020","author":"R Qian","year":"2020","unstructured":"Qian, R., Hu, D., Dinkel, H., Wu, M., Xu, N., Lin, W.: Multiple sound sources localization from coarse to fine. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 292\u2013308. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_18"},{"key":"9_CR56","unstructured":"Qian, R., et al.: Exploring temporal granularity in self-supervised video representation learning. arXiv preprint arXiv:2112.04480 (2021)"},{"key":"9_CR57","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Enhancing self-supervised video representation learning via multi-level feature optimization. arXiv preprint arXiv:2108.02183 (2021)","DOI":"10.1109\/ICCV48922.2021.00789"},{"key":"9_CR58","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. arXiv preprint arXiv:2008.03800 (2020)"},{"key":"9_CR59","doi-asserted-by":"crossref","unstructured":"Recasens, A., et al.: Broaden your views for self-supervised video learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1255\u20131265 (2021)","DOI":"10.1109\/ICCV48922.2021.00129"},{"key":"9_CR60","doi-asserted-by":"crossref","unstructured":"Regatti, J.R., Deshmukh, A.A., Manavoglu, E., Dogan, U.: Consensus clustering with unsupervised representation learning. arXiv preprint arXiv:2010.01245 (2020)","DOI":"10.1109\/IJCNN52387.2021.9533714"},{"key":"9_CR61","doi-asserted-by":"crossref","unstructured":"Sawada, Y., Nakamura, K.: Concept bottleneck model with additional unsupervised concepts. arXiv preprint arXiv:2202.01459 (2022)","DOI":"10.1109\/ACCESS.2022.3167702"},{"key":"9_CR62","doi-asserted-by":"publisher","unstructured":"Seel, N.M.: Encyclopedia of the sciences of learning, 1st edn. Springer (2011). https:\/\/doi.org\/10.1007\/978-1-4419-1428-6","DOI":"10.1007\/978-1-4419-1428-6"},{"key":"9_CR63","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"9_CR64","unstructured":"Sun, C., Baradel, F., Murphy, K., Schmid, C.: Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)"},{"key":"9_CR65","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1007\/978-3-030-58621-8_45","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Tian","year":"2020","unstructured":"Tian, Y., Krishnan, D., Isola, P.: Contrastive multiview coding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 776\u2013794. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_45"},{"key":"9_CR66","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"9_CR67","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems 30 (2017)"},{"key":"9_CR68","unstructured":"Wang, J., Jiao, J., Bao, L., He, S., Liu, W., Liu, Y.: Self-supervised video representation learning by uncovering spatio-temporal statistics. arXiv preprint arXiv:2008.13426 (2020)"},{"key":"9_CR69","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiao, J., Bao, L., He, S., Liu, Y., Liu, W.: Self-supervised spatio-temporal representation learning for videos by predicting motion and appearance statistics. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4006\u20134015 (2019)","DOI":"10.1109\/CVPR.2019.00413"},{"key":"9_CR70","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1007\/978-3-030-58520-4_30","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Wang","year":"2020","unstructured":"Wang, J., Jiao, J., Liu, Y.-H.: Self-supervised video representation learning by pace prediction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 504\u2013521. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_30"},{"key":"9_CR71","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"9_CR72","doi-asserted-by":"crossref","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2566\u20132576 (2019)","DOI":"10.1109\/CVPR.2019.00267"},{"key":"9_CR73","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, R., Shen, C., Kong, T., Li, L.: Dense contrastive learning for self-supervised visual pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3024\u20133033 (2021)","DOI":"10.1109\/CVPR46437.2021.00304"},{"key":"9_CR74","unstructured":"Weinzaepfel, P., Lucas, T., Larlus, D., Kalantidis, Y.: Learning super-features for image retrieval. arXiv preprint arXiv:2201.13182 (2022)"},{"key":"9_CR75","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance discrimination. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3733\u20133742 (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"9_CR76","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"9_CR77","doi-asserted-by":"crossref","unstructured":"Xie, Z., Lin, Y., Zhang, Z., Cao, Y., Lin, S., Hu, H.: Propagate yourself: exploring pixel-level consistency for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16684\u201316693 (2021)","DOI":"10.1109\/CVPR46437.2021.01641"},{"key":"9_CR78","unstructured":"Xiong, S., Tan, Y., Wang, G.: Explore visual concept formation for image classification. In: International Conference on Machine Learning, pp. 11470\u201311479. PMLR (2021)"},{"key":"9_CR79","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., Shao, J., Xie, D., Zhuang, Y.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343 (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"9_CR80","unstructured":"Yang, C., Xu, Y., Dai, B., Zhou, B.: Video representation learning with visual tempo consistency. arXiv preprint arXiv:2006.15489 (2020)"},{"key":"9_CR81","doi-asserted-by":"crossref","unstructured":"Yao, T., Zhang, Y., Qiu, Z., Pan, Y., Mei, T.: SeCo: exploring sequence supervision for unsupervised representation learning. arXiv preprint arXiv:2008.00975 (2020)","DOI":"10.1609\/aaai.v35i12.17274"},{"key":"9_CR82","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liu, C., Luo, D., Zhou, Y., Ye, Q.: Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6548\u20136557 (2020)","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"9_CR83","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Contextualized spatio-temporal contrastive learning with self-supervision. arXiv preprint arXiv:2112.05181 (2021)","DOI":"10.1109\/CVPR52688.2022.01359"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19809-0_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,3]],"date-time":"2022-11-03T00:07:36Z","timestamp":1667434056000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19809-0_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198083","9783031198090"],"references-count":83,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19809-0_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"1 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}