{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:10:09Z","timestamp":1774602609242,"version":"3.50.1"},"reference-count":103,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T00:00:00Z","timestamp":1772064000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T00:00:00Z","timestamp":1772064000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-026-02785-4","type":"journal-article","created":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T13:11:24Z","timestamp":1772111484000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Self-Supervised Video Representation Learning in a Heuristic Decoupled Perspective"],"prefix":"10.1007","volume":"134","author":[{"given":"Zeen","family":"Song","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7985-5743","authenticated-orcid":false,"given":"Wenwen","family":"Qiang","sequence":"additional","affiliation":[]},{"given":"Changwen","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Hua","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,26]]},"reference":[{"key":"2785_CR1","doi-asserted-by":"crossref","unstructured":"Ahsan, U., Madhok, R., & Essa, I. (2019). Video jigsaw: Unsupervised learning of spatiotemporal context for video action recognition. In 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), pages 179\u2013189. IEEE.","DOI":"10.1109\/WACV.2019.00025"},{"key":"2785_CR2","unstructured":"Azencot, O., Erichson, N. B., Lin, V., & Mahoney, M. (2020). Forecasting Sequential Data Using Consistent Koopman Autoencoders. In Proceedings of the 37th International Conference on Machine Learning, pages 475\u2013485. PMLR, November."},{"key":"2785_CR3","doi-asserted-by":"crossref","unstructured":"Beery, S., Van Horn, G., & Perona, P. (2018). Recognition in terra incognita. In Proceedings of the European conference on computer vision (ECCV), pages 456\u2013473.","DOI":"10.1007\/978-3-030-01270-0_28"},{"key":"2785_CR4","doi-asserted-by":"crossref","unstructured":"Benaim, S., Ephrat, A., Lang, O., Mosseri, I., Freeman, W. T., Rubinstein, M., Irani, M., & Dekel, T. (2020). Speednet: Learning the speediness in videos. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 9922\u20139931.","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"2785_CR5","unstructured":"Berman, N., Naiman, I, & Azencot, O. (2022). Multifactor sequential disentanglement via structured koopman autoencoders. In The Eleventh International Conference on Learning Representations."},{"key":"2785_CR6","unstructured":"Bertasius, G., Wang, H., & Torresani, L. (2021). Is space-time attention all you need for video understanding? In ICML, volume 2, page 4."},{"key":"2785_CR7","doi-asserted-by":"crossref","unstructured":"Birkhoff, G. D. (1927). Dynamical systems, volume 9. American Mathematical Soc.","DOI":"10.1090\/coll\/009"},{"key":"2785_CR8","doi-asserted-by":"crossref","unstructured":"Bregonzio, M., Gong, S., & Xiang, T. (2009). Recognising action as clouds of space-time interest points. In 2009 IEEE conference on computer vision and pattern recognition, pages 1948\u20131955. IEEE.","DOI":"10.1109\/CVPR.2009.5206779"},{"key":"2785_CR9","doi-asserted-by":"crossref","unstructured":"Brunton, S. L., Brunton, B. W., Proctor, J. L., & Kutz, J. N. (2016). Koopman invariant subspaces and finite linear representations of nonlinear dynamical systems for control. PLOS ONE, 11(2):e0150171, February.","DOI":"10.1371\/journal.pone.0150171"},{"key":"2785_CR10","doi-asserted-by":"crossref","unstructured":"Brunton, S. L., Budi\u0161i\u0107, M., Kaiser, E., & Kutz, J. N. (2021). Modern Koopman Theory for Dynamical Systems, October. arXiv:2102.12086 [cs, eess, math].","DOI":"10.1137\/21M1401243"},{"key":"2785_CR11","doi-asserted-by":"crossref","unstructured":"Brunton, B. W., Johnson, L. A., Ojemann, J. G., & Kutz, J. N. (2016). Extracting spatial-temporal coherent patterns in large-scale neural recordings using dynamic mode decomposition. Journal of neuroscience methods,258, 1\u201315.","DOI":"10.1016\/j.jneumeth.2015.10.010"},{"key":"2785_CR12","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., & Joulin, A. (2021). Unsupervised Learning of Visual Features by Contrasting Cluster Assignments, January. arXiv:2006.09882 [cs]."},{"key":"2785_CR13","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., & Joulin, A. (2021). Emerging Properties in Self-Supervised Vision Transformers, May. arXiv:2104.14294 [cs].","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"2785_CR14","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020). A Simple Framework for Contrastive Learning of Visual Representations. In Proceedings of the 37th International Conference on Machine Learning, pages 1597\u20131607. PMLR, November."},{"key":"2785_CR15","doi-asserted-by":"publisher","first-page":"1045","DOI":"10.1609\/aaai.v35i2.16189","volume":"35","author":"P Chen","year":"2021","unstructured":"Chen, P., Huang, D., He, D., Long, X., Zeng, R., Wen, S., Tan, M., & Gan, C. (2021). Rspnet: Relative speed perception for unsupervised video representation learning. In Proceedings of the AAAI Conference on Artificial Intelligence, 35, 1045\u20131053.","journal-title":"In Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2785_CR16","unstructured":"Choe, S. K., Neiswanger, W., Xie, P., & Xing, E. (2022). Betty: An Automatic Differentiation Library for Multilevel Optimization. In The Eleventh International Conference on Learning Representations, September."},{"key":"2785_CR17","doi-asserted-by":"crossref","unstructured":"Dave, I., Gupta, R., Rizve, M. N., & Shah, M. (2022). TCLR: Temporal Contrastive Learning for Video Representation. Computer Vision and Image Understanding, 219:103406, June. arXiv:2101.07974 [cs].","DOI":"10.1016\/j.cviu.2022.103406"},{"key":"2785_CR18","doi-asserted-by":"publisher","first-page":"845","DOI":"10.1007\/s11263-020-01393-0","volume":"129","author":"P Dendorfer","year":"2021","unstructured":"Dendorfer, P., Osep, A., Milan, A., Schindler, K., Cremers, D., Reid, I., Roth, S., & Leal-Taix\u00e9, L. (2021). Motchallenge: A benchmark for single-camera multiple target tracking. International Journal of Computer Vision, 129, 845\u2013881.","journal-title":"International Journal of Computer Vision"},{"key":"2785_CR19","unstructured":"Devlin, J. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"2785_CR20","doi-asserted-by":"crossref","unstructured":"Erichson, N. B., Brunton, S. L., & Kutz, J. N. (2019). Compressed Dynamic Mode Decomposition for Background Modeling. Journal of Real-Time Image Processing,16(5), 1479\u20131492. arXiv:1512.04205 [cs].","DOI":"10.1007\/s11554-016-0655-2"},{"key":"2785_CR21","doi-asserted-by":"crossref","unstructured":"Fan, D., Wang, J., Liao, S., Zhu, Y., Bhat, V., Santos-Villalobos, H., Rohith M.V., & Li, X. (2023). Motion-guided masking for spatiotemporal representation learning. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 5619\u20135629.","DOI":"10.1109\/ICCV51070.2023.00517"},{"key":"2785_CR22","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., & He, K. (2019). SlowFast Networks for Video Recognition, October. arXiv:1812.03982 [cs].","DOI":"10.1109\/ICCV.2019.00630"},{"key":"2785_CR23","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., & He, K. (2021). A Large-Scale Study on Unsupervised Spatiotemporal Representation Learning. arXiv:2104.14558 [cs].","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"2785_CR24","first-page":"35946","volume":"35","author":"C Feichtenhofer","year":"2022","unstructured":"Feichtenhofer, C., Li, Y., & He, K. (2022). Masked autoencoders as spatiotemporal learners. Advances in neural information processing systems, 35, 35946\u201335958.","journal-title":"Advances in neural information processing systems"},{"key":"2785_CR25","doi-asserted-by":"crossref","unstructured":"Fernando, B., Bilen, H., Gavves, E., & Gould, S. (2017). Self-supervised video representation learning with odd-one-out networks. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 3636\u20133645.","DOI":"10.1109\/CVPR.2017.607"},{"key":"2785_CR26","unstructured":"Finn, C., Abbeel, P., & Levine, S. (2017).1126\u20131135. PMLR."},{"key":"2785_CR27","unstructured":"Fisher, R. A., Fisher, R. A., Genetiker, S., Fisher, R. A., Genetician, S., Britain, G., Fisher, R. A., & G\u00e9n\u00e9ticien, S. (1966). The design of experiments, volume 21. Springer."},{"key":"2785_CR28","doi-asserted-by":"crossref","unstructured":"Geirhos, R., Jacobsen, J.-H., Michaelis, C., Zemel, R., Brendel, W., Bethge, M., & Wichmann, F. A. (2020). Shortcut learning in deep neural networks. Nature Machine Intelligence, 2(11), 665\u2013673.","DOI":"10.1038\/s42256-020-00257-z"},{"key":"2785_CR29","doi-asserted-by":"crossref","unstructured":"Girshick, R. (2015). Fast r-cnn. In Proceedings of the IEEE international conference on computer vision, pages 1440\u20131448.","DOI":"10.1109\/ICCV.2015.169"},{"key":"2785_CR30","unstructured":"Girshick, R., Radosavovic, I., Gkioxari, G., Doll\u00e1r, P., & He, K. (2018). Detectron. https:\/\/github.com\/facebookresearch\/detectron."},{"key":"2785_CR31","unstructured":"Glymour, M., Pearl, J., & Jewell, N. P. (2016). Causal Inference in Statistics: A Primer. John Wiley & Sons, January. Google-Books-ID: I0V2CwAAQBAJ."},{"key":"2785_CR32","doi-asserted-by":"crossref","unstructured":"Goyal, R., Kahou, S. E., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., et al. (2017). The\" something something\" video database for learning and evaluating visual common sense. In Proceedings of the IEEE international conference on computer vision, pages 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"2785_CR33","unstructured":"Grill, J.-B., Strub, F., Altch\u00e9, F., Tallec, C., Richemond, P., Buchatskaya, E., Doersch, C., Pires, B. A., Guo, Z., Azar, M. G., Piot, B., kavukcuoglu, k., Munos, R., & Valko, M. (2020). Bootstrap Your Own Latent - A New Approach to Self-Supervised Learning. In Advances in Neural Information Processing Systems, volume 33, pages 21271\u201321284. Curran Associates Inc."},{"key":"2785_CR34","doi-asserted-by":"crossref","unstructured":"Gu, C., Sun, C., Ross, D. A., Vondrick, C., Pantofaru, C., Li, Y., Vijayanarasimhan, S., Toderici, G., Ricco, S., Sukthankar, R., et al. (2018). Ava: A video dataset of spatio-temporally localized atomic visual actions. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 6047\u20136056.","DOI":"10.1109\/CVPR.2018.00633"},{"key":"2785_CR35","unstructured":"Han, T., Xie, W., & Zisserman, A. (2021). Self-supervised Co-training for Video Representation Learning, January. arXiv:2010.09709 [cs]."},{"key":"2785_CR36","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R. (2022). Masked autoencoders are scalable vision learners. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2785_CR37","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R. (2020). Momentum Contrast for Unsupervised Visual Representation Learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 9729\u20139738.","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2785_CR38","unstructured":"Hermann, K. L., Mobahi, H., Fel, T., & Mozer, M. C. (2023). On the foundations of shortcut learning. arXiv preprint arXiv:2310.16228."},{"key":"2785_CR39","doi-asserted-by":"crossref","unstructured":"Ilic, F., Pock, T., & Wildes, R. P. (2022). Is appearance free action recognition possible? In European Conference on Computer Vision, pages 156\u2013173. Springer.","DOI":"10.1007\/978-3-031-19772-7_10"},{"issue":"1","key":"2785_CR40","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji, S., Wei, X., Yang, M., & Kai, Yu. (2012). 3d convolutional neural networks for human action recognition. IEEE transactions on pattern analysis and machine intelligence, 35(1), 221\u2013231.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2785_CR41","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., & Fei-Fei, L. (2014). Large-scale video classification with convolutional neural networks. In Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pages 1725\u20131732.","DOI":"10.1109\/CVPR.2014.223"},{"key":"2785_CR42","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, P., et al. (2017). The kinetics human action video dataset. arXiv preprint arXiv:1705.06950."},{"key":"2785_CR43","doi-asserted-by":"crossref","unstructured":"Khorasgani, S. H., Chen, Y., & Shkurti, F. (2022). SLIC: Self-Supervised Learning with Iterative Clustering for Human Action Videos, June. arXiv:2206.12534 [cs].","DOI":"10.1109\/CVPR52688.2022.01562"},{"key":"2785_CR44","doi-asserted-by":"crossref","unstructured":"Khosla, A., Zhou, T., Malisiewicz, T., Efros, A. A., & Torralba, A. (2012). Undoing the damage of dataset bias. In Computer Vision-ECCV 2012: 12th European Conference on Computer Vision, Florence, Italy, October 7\u201313, 2012, Proceedings, Part I 12, pages 158\u2013171. Springer.","DOI":"10.1007\/978-3-642-33718-5_12"},{"issue":"5","key":"2785_CR45","doi-asserted-by":"publisher","first-page":"1366","DOI":"10.1007\/s11263-022-01594-9","volume":"130","author":"Yu Kong","year":"2022","unstructured":"Kong, Yu., & Yun, F. (2022). Human action recognition and prediction: A survey. International Journal of Computer Vision, 130(5), 1366\u20131401.","journal-title":"International Journal of Computer Vision"},{"issue":"5","key":"2785_CR46","doi-asserted-by":"publisher","first-page":"315","DOI":"10.1073\/pnas.17.5.315","volume":"17","author":"BO Koopman","year":"1931","unstructured":"Koopman, B. O. (1931). Hamiltonian systems and transformation in hilbert space. Proceedings of the National Academy of Sciences, 17(5), 315\u2013318.","journal-title":"Proceedings of the National Academy of Sciences"},{"key":"2785_CR47","doi-asserted-by":"crossref","unstructured":"Kuang, H., Zhu, Y., Zhang, Z., Li, X., Tighe, J., Schwertfeger, S., Stachniss, C., & Li, M. (2021). Video contrastive learning with global context. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 3195\u20133204.","DOI":"10.1109\/ICCVW54120.2021.00358"},{"key":"2785_CR48","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., & Serre, T. (2011). Hmdb: a large video database for human motion recognition. In 2011 International conference on computer vision, pages 2556\u20132563. IEEE.","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"2785_CR49","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, Y., & Vasconcelos, N. (2018). RESOUND: Towards Action Recognition Without Representation Bias. volume 11210In V. Ferrari, M. Hebert, C. Sminchisescu, & Y. Weiss (Eds.), Computer Vision - ECCV 2018 (pp. 520\u2013535). Cham: Springer International Publishing.","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"2785_CR50","doi-asserted-by":"crossref","unstructured":"Li, H., Liu, Y., Zhang, H., & Li, B. (2023). Mitigating and Evaluating Static Bias of Action Representations in the Background and the Foreground. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 19911\u201319923.","DOI":"10.1109\/ICCV51070.2023.01823"},{"key":"2785_CR51","unstructured":"Li, A., Thotakuri, M., Ross, D. A., Carreira, J., Vostrikov, A., & Zisserman, A. (2020). The ava-kinetics localized human actions video dataset. arXiv preprint arXiv:2005.00214."},{"key":"2785_CR52","doi-asserted-by":"crossref","unstructured":"Lin, T., Liu, X., Li, X., Ding, E., & Wen, S. (2019). Bmn: Boundary-matching network for temporal action proposal generation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2019.00399"},{"key":"2785_CR53","unstructured":"Liu, H., Simonyan, K., & Yang, Y. (2018). Darts: Differentiable architecture search. arXiv preprint arXiv:1806.09055."},{"key":"2785_CR54","unstructured":"Lorraine, J., Vicol, P., & Duvenaud, D. (2020). Optimizing Millions of Hyperparameters by Implicit Differentiation. In Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics, pages 1540\u20131552. PMLR, June."},{"key":"2785_CR55","doi-asserted-by":"crossref","unstructured":"Luenberger, D. G. (1979). Dynamic Systems. J. Wiley Sons.","DOI":"10.21236\/ADA080206"},{"issue":"1","key":"2785_CR56","doi-asserted-by":"publisher","first-page":"4950","DOI":"10.1038\/s41467-018-07210-0","volume":"9","author":"B Lusch","year":"2018","unstructured":"Lusch, B., Kutz, J. N., & Brunton, S. L. (2018). Deep learning for universal linear embeddings of nonlinear dynamics. Nature communications, 9(1), 4950.","journal-title":"Nature communications"},{"key":"2785_CR57","doi-asserted-by":"crossref","unstructured":"Madan, N., M\u00f8gelmose, A., Modi, R., Rawat, Y. S., & Moeslund, T. B. (2024). Foundation models for video understanding: A survey. Authorea Preprints.","DOI":"10.36227\/techrxiv.171769139.99464428\/v2"},{"key":"2785_CR58","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., & Liu, W. (2021). Videomoco: Contrastive video representation learning with temporally adversarial examples. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 11205\u201311214.","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"2785_CR59","unstructured":"Patrick, M., Asano, Y. M., Kuznetsova, P., Fong, R., Henriques, J. F., Zweig, G., & Vedaldi, A. (2021). Multi-modal self-supervision from generalized data transformations."},{"key":"2785_CR60","doi-asserted-by":"crossref","unstructured":"Pearl, J. (2009). Causality. Cambridge University Press.","DOI":"10.1017\/CBO9780511803161"},{"key":"2785_CR61","unstructured":"Peters, J., Janzing, D., & Sch\u00f6lkopf, B. (2017). Elements of causal inference: foundations and learning algorithms. The MIT Press."},{"key":"2785_CR62","first-page":"1256","volume":"34","author":"M Pezeshki","year":"2021","unstructured":"Pezeshki, M., Kaba, O., Bengio, Y., Courville, A. C., Precup, D., & Lajoie, G. (2021). Gradient starvation: A learning proclivity in neural networks. Advances in Neural Information Processing Systems, 34, 1256\u20131272.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2785_CR63","doi-asserted-by":"crossref","unstructured":"Qian, R., Ding, S., Liu, X., & Lin, D. (2022). Static and Dynamic Concepts for Self-supervised Video Representation Learning. In Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G. M., & Hassner, T., (Eds.), Computer Vision - ECCV 2022, volume 13686, pages 145\u2013164. Springer Nature Switzerland, Cham.","DOI":"10.1007\/978-3-031-19809-0_9"},{"key":"2785_CR64","doi-asserted-by":"crossref","unstructured":"Qian, R., Meng, T., Gong, B., Yang, M.-H., Wang, H., Belongie, S., & Cui, Y. (2021). Spatiotemporal contrastive video representation learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 6964\u20136974.","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"2785_CR65","unstructured":"Qiang, W., Li, J., Zheng, C., Su, B., & Xiong, H. (2022). Interventional Contrastive Learning with Meta Semantic Regularizer. In Proceedings of the 39th International Conference on Machine Learning, pages 18018\u201318030. PMLR, June."},{"key":"2785_CR66","unstructured":"Rajeswaran, A., Finn, C., Kakade, S. M., & Levine, S. (2019). Meta-learning with implicit gradients. Advances in neural information processing systems, 32."},{"key":"2785_CR67","unstructured":"Rajeswaran, A., Mordatch, I., & Kumar, V. (2020). A game theoretic framework for model based reinforcement learning. In International conference on machine learning, pages 7953\u20137963. PMLR."},{"key":"2785_CR68","doi-asserted-by":"crossref","unstructured":"Ranasinghe, K., Naseer, M., Khan, S., F. S., Khan, & Michael S. (2022). Ryoo. Self-supervised Video Transformer. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pages 2864\u20132874, New Orleans, LA, USA, June. IEEE.","DOI":"10.1109\/CVPR52688.2022.00289"},{"key":"2785_CR69","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, 28."},{"issue":"1","key":"2785_CR70","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1093\/biomet\/70.1.41","volume":"70","author":"PR Rosenbaum","year":"1983","unstructured":"Rosenbaum, P. R., & Rubin, D. B. (1983). The central role of the propensity score in observational studies for causal effects. Biometrika, 70(1), 41\u201355.","journal-title":"Biometrika"},{"key":"2785_CR71","doi-asserted-by":"crossref","unstructured":"Rubin, D. B. (2005). Causal Inference Using Potential Outcomes: Design, Modeling, Decisions. Journal of the American Statistical Association, 100(469), 322\u2013331, March.","DOI":"10.1198\/016214504000001880"},{"key":"2785_CR72","doi-asserted-by":"crossref","unstructured":"Sarkar, P., Beirami, A., & Etemad, A. (2023). Uncovering the Hidden Dynamics of Video Self-supervised Learning under Distribution Shifts. arXiv preprint arXiv:2306.02014.","DOI":"10.52202\/075280-2340"},{"key":"2785_CR73","doi-asserted-by":"crossref","unstructured":"Schiappa, M. C., Rawat, Y. S., & Shah, M. (2022). Self-supervised learning for videos: A survey. ACM Computing Surveys. Publisher: ACM New York, NY.","DOI":"10.1145\/3577925"},{"key":"2785_CR74","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1017\/S0022112010001217","volume":"656","author":"PJ Schmid","year":"2010","unstructured":"Schmid, P. J. (2010). Dynamic mode decomposition of numerical and experimental data. Journal of fluid mechanics, 656, 5\u201328.","journal-title":"Journal of fluid mechanics"},{"key":"2785_CR75","doi-asserted-by":"crossref","unstructured":"Schuldt, C., Laptev, I., & Caputo, B. (2004). Recognizing human actions: a local svm approach. In Proceedings of the 17th International Conference on Pattern Recognition, 2004. ICPR 2004, volume 3, pages 32\u201336. IEEE.","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"2785_CR76","doi-asserted-by":"crossref","unstructured":"Selvaraju, R. R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., & Batra, D. (2017). Grad-cam: Visual explanations from deep networks via gradient-based localization. In Proceedings of the IEEE international conference on computer vision, pages 618\u2013626.","DOI":"10.1109\/ICCV.2017.74"},{"key":"2785_CR77","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., & Lin, D. (2020). Finegym: A hierarchical video dataset for fine-grained action understanding. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 2616\u20132625.","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"2785_CR78","unstructured":"Soomro, K., Zamir, A. R., & Shah, M. (2012). Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402."},{"key":"2785_CR79","doi-asserted-by":"crossref","unstructured":"Sun, X., Chen, P., Chen, L., Li, C., Li, T. H., Tan, M., & Gan, C. (2023). Masked Motion Encoding for Self-Supervised Video Representation Learning. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pages 2235\u20132245, Vancouver, BC, Canada, June. IEEE.","DOI":"10.1109\/CVPR52729.2023.00222"},{"key":"2785_CR80","doi-asserted-by":"crossref","unstructured":"Takeishi, N., Kawahara, Y., & Yairi, T. (2017). Learning koopman invariant subspaces for dynamic mode decomposition. Advances in neural information processing systems, 30.","DOI":"10.1109\/ICIP.2017.8296769"},{"key":"2785_CR81","doi-asserted-by":"crossref","unstructured":"Teed, Z., & Deng, J. (2020). Raft: Recurrent all-pairs field transforms for optical flow. In European conference on computer vision, pages 402\u2013419. Springer.","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"2785_CR82","doi-asserted-by":"crossref","unstructured":"Thoker, F. M., Doughty, H., & Snoek. C. (2023). Tubelet-Contrastive Self-Supervision for Video-Efficient Generalization, September. arXiv:2303.11003 [cs].","DOI":"10.1109\/ICCV51070.2023.01270"},{"key":"2785_CR83","doi-asserted-by":"crossref","unstructured":"Thoker, F. M., Doughty, H., Bagad, P., & Snoek, C. G. M. (2022). How Severe Is Benchmark-Sensitivity in Video Self-supervised Learning? In Computer Vision \u2013 ECCV 2022, volume 13694. Springer Nature Switzerland. Series Title: Lecture Notes in Computer Science.","DOI":"10.1007\/978-3-031-19830-4_36"},{"key":"2785_CR84","unstructured":"Tian, Y., Sun, C., Poole, B., Krishnan, D., Schmid, C., & Isola, P. (2020). What Makes for Good Views for Contrastive Learning? In Advances in Neural Information Processing Systems, volume 33, pages 6827\u20136839. Curran Associates Inc."},{"key":"2785_CR85","doi-asserted-by":"crossref","unstructured":"Tong, Z., Song, Y., Wang, J., & Wang, L. (2022). VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training, October. arXiv:2203.12602 [cs].","DOI":"10.52202\/068431-0732"},{"key":"2785_CR86","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., & Paluri, M. (2018). A closer look at spatiotemporal convolutions for action recognition. In Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pages 6450\u20136459.","DOI":"10.1109\/CVPR.2018.00675"},{"key":"2785_CR87","unstructured":"Wang, J., Gao, Y., Li, K., Hu, J., Jiang, X., Guo, X., Ji, R., & Sun, X. (2020). Enhancing Unsupervised Video Representation Learning by Decoupling the Scene and the Motion, December. arXiv:2009.05757 [cs]."},{"key":"2785_CR88","doi-asserted-by":"crossref","unstructured":"Wang, J., Gao, Y., Li, K., Lin, Y., Ma, A. J., Cheng, H., Peng, P., Huang, F., Ji, R., & Sun, X. (2021). Removing the background by adding the background: Towards background robust self-supervised video representation learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 11804\u201311813.","DOI":"10.1109\/CVPR46437.2021.01163"},{"key":"2785_CR89","doi-asserted-by":"crossref","unstructured":"Wang, L., Huang, B., Zhao, Z., Tong, Z., He, Y., Wang, Y., Wang, Y., & Qiao, Y. (2023). Videomae v2: Scaling video masked autoencoders with dual masking. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 14549\u201314560.","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"2785_CR90","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiao, J., & Liu, Y.-H. (2020). Self-supervised Video Representation Learning by Pace Prediction, September. arXiv:2008.05861 [cs].","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"2785_CR91","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiao, J., Bao, L., He, S., Liu, Y., & Liu, W. (2019). Self-supervised spatio-temporal representation learning for videos by predicting motion and appearance statistics. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 4006\u20134015.","DOI":"10.1109\/CVPR.2019.00413"},{"key":"2785_CR92","doi-asserted-by":"crossref","unstructured":"Williams, M. O., Kevrekidis, I. G., & Rowley, C. W. (2015). A data\u2013driven approximation of the koopman operator: Extending dynamic mode decomposition. Journal of Nonlinear Science, 25:1307\u20131346. Publisher: Springer.","DOI":"10.1007\/s00332-015-9258-5"},{"key":"2785_CR93","doi-asserted-by":"crossref","unstructured":"Xiao, F., Tighe, J., & Modolo, D. (2022). MaCLR: Motion-Aware Contrastive Learning of Representations for Videos. In Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G. M., & Hassner, T., (Eds.), Computer Vision \u2013 ECCV 2022, volume 13695, pages 353\u2013370. Springer Nature Switzerland, Cham. Series Title: Lecture Notes in Computer Science.","DOI":"10.1007\/978-3-031-19833-5_21"},{"key":"2785_CR94","doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., & Murphy, K. (2018). Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In Proceedings of the European conference on computer vision (ECCV), pages 305\u2013321.","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"2785_CR95","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhao, C., Rojas, D. S., Thabet, A., & Ghanem, B. (2020). G-tad: Sub-graph localization for temporal action detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"2785_CR96","unstructured":"Yang, H., Huang, D., Wen, B., Wu, J., Yao, H., Jiang, Y., Zhu, X., & Yuan, Z. (2022). Self-supervised Video Representation Learning with Motion-Aware Masked Autoencoders, October. arXiv:2210.04154 [cs]."},{"key":"2785_CR97","unstructured":"Yang, C., Xu, Y., Dai, B., & Zhou, B. (2020). Video Representation Learning with Visual Tempo Consistency, December. arXiv:2006.15489 [cs]."},{"key":"2785_CR98","unstructured":"Yang, X., Zhang, H., & Cai, J. ( 2020). Deconfounded Image Captioning: A Causal Retrospect, March."},{"key":"2785_CR99","doi-asserted-by":"crossref","unstructured":"Yao, T., Zhang, Y., Qiu, Z., Pan, Y., & Mei, T. (2021). Seco: Exploring sequence supervision for unsupervised representation learning. In Proceedings of the AAAI Conference on Artificial Intelligence, pages 10656\u201310664.","DOI":"10.1609\/aaai.v35i12.17274"},{"key":"2785_CR100","unstructured":"You, Y., Gitman, I., & Ginsburg, B. (2017). Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888."},{"key":"2785_CR101","unstructured":"Yue, Z., Zhang, H., Sun, Q., & Hua, X.-S. (2020). Interventional Few-Shot Learning, December."},{"key":"2785_CR102","unstructured":"Zbontar, J., Jing, L., Misra, I., LeCun, Y., & Deny, S. (2021). Barlow Twins: Self-Supervised Learning via Redundancy Reduction. In Proceedings of the 38th International Conference on Machine Learning, pages 12310\u201312320. PMLR, July."},{"key":"2785_CR103","doi-asserted-by":"crossref","unstructured":"Zhang, C., Wu, J., & Li, Y. (2022). ActionFormer: Localizing Moments of Actions with Transformers, August. arXiv:2202.07925 [cs].","DOI":"10.1007\/978-3-031-19772-7_29"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02785-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02785-4","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02785-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:32:37Z","timestamp":1774600357000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02785-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,26]]},"references-count":103,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2785"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02785-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,26]]},"assertion":[{"value":"9 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 March 2026","order":5,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":6,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The original version of this article was revised due to update in affiliation.","order":7,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}}],"article-number":"139"}}