{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T21:06:59Z","timestamp":1757452019729,"version":"3.40.3"},"publisher-location":"Cham","reference-count":96,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198205"},{"type":"electronic","value":"9783031198212"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19821-2_1","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:12:59Z","timestamp":1666440779000},"page":"1-22","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["GOCA: Guided Online Cluster Assignment for\u00a0Self-supervised Video Representation Learning"],"prefix":"10.1007","author":[{"given":"Huseyin","family":"Coskun","sequence":"first","affiliation":[]},{"given":"Alireza","family":"Zareian","sequence":"additional","affiliation":[]},{"given":"Joshua L.","family":"Moore","sequence":"additional","affiliation":[]},{"given":"Federico","family":"Tombari","sequence":"additional","affiliation":[]},{"given":"Chen","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"1_CR1","unstructured":"Ahsan, U., Sun, C., Essa, I.: DiscrimNet: semi-supervised action recognition from videos using generative adversarial networks. arXiv preprint arXiv:1801.07230 (2018)"},{"key":"1_CR2","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. In: Advances in Neural Information Processing Systems (2021)"},{"key":"1_CR3","unstructured":"Alayrac, J.B., et al.: Self-supervised multimodal versatile networks. In: NeurIPS, vol. 2, no. 6, p. 7 (2020)"},{"key":"1_CR4","unstructured":"Alwassel, H., Mahajan, D., Korbar, B., Torresani, L., Ghanem, B., Tran, D.: Self-supervised learning by cross-modal audio-video clustering. In: Advances in Neural Information Processing Systems 33 (2020)"},{"key":"1_CR5","unstructured":"Alwassel, H., Mahajan, D., Korbar, B., Torresani, L., Ghanem, B., Tran, D.: Self-supervised learning by cross-modal audio-video clustering. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"1_CR6","unstructured":"Asano, Y.M., Patrick, M., Rupprecht, C., Vedaldi, A.: Labelling unlabelled videos from scratch with multi-modal self-supervision. In: Advances in Neural Information Processing Systems (2020)"},{"key":"1_CR7","unstructured":"Asano, Y.M., Rupprecht, C., Vedaldi, A.: Self-labelling via simultaneous clustering and representation learning. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"Benaim, S., et al.: SpeedNet: learning the speediness in videos. in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9922\u20139931 (2020)","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"1_CR9","unstructured":"Cai, T., Gao, R., Lee, J.D., Lei, Q.: A theory of label propagation for subpopulation shift. arXiv preprint arXiv:2102.11203 (2021)"},{"key":"1_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01264-9_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Caron","year":"2018","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 139\u2013156. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9"},{"key":"1_CR11","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. In: Advances in Neural Information Processing Systems (2020)"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Vijayanarasimhan, S., Seybold, B., Ross, D.A., Deng, J., Sukthankar, R.: Rethinking the faster R-CNN architecture for temporal action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1130\u20131139 (2018)","DOI":"10.1109\/CVPR.2018.00124"},{"key":"1_CR14","doi-asserted-by":"publisher","unstructured":"Chen, S., Tian, Y., Wen, F., Xu, Y., Tang, X.: EasyToon: an easy and quick tool to personalize a cartoon storyboard using family photo album. In: El-Saddik, A., Vuong, S., Griwodz, C., Bimbo, A.D., Candan, K.S., Jaimes, A. (eds.) Proceedings of the 16th International Conference on Multimedia 2008, Vancouver, British Columbia, Canada, 26\u201331 October 2008, pp. 499\u2013508. ACM (2008). https:\/\/doi.org\/10.1145\/1459359.1459426","DOI":"10.1145\/1459359.1459426"},{"key":"1_CR15","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Chen, X., He, K.: Exploring simple Siamese representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15750\u201315758 (2021)","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An empirical study of training self-supervised vision transformers. arXiv preprint arXiv:2104.02057 (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"1_CR18","unstructured":"Cho, H., Kim, T., Chang, H.J., Hwang, W.: Self-supervised spatio-temporal representation learning using variable playback speed prediction. arXiv preprint arXiv:2003.02692 (2020)"},{"key":"1_CR19","doi-asserted-by":"publisher","first-page":"79562","DOI":"10.1109\/ACCESS.2021.3084840","volume":"9","author":"H Cho","year":"2021","unstructured":"Cho, H., Kim, T., Chang, H.J., Hwang, W.: Self-supervised visual learning by variable playback speeds prediction of a video. IEEE Access 9, 79562\u201379571 (2021)","journal-title":"IEEE Access"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Coskun, H., et al.: Domain-specific priors and meta learning for low-shot first-person action recognition. IEEE Trans. Pattern Anal. Mach. Intell. (2021)","DOI":"10.1109\/TPAMI.2021.3058606"},{"key":"1_CR21","unstructured":"Cuturi, M.: Sinkhorn distances: lightspeed computation of optimal transport. In: Advances in Neural Information Processing Systems 26, pp. 2292\u20132300 (2013)"},{"key":"1_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/978-3-030-01225-0_44","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Damen","year":"2018","unstructured":"Damen, D., et al.: Scaling egocentric vision: the epic-kitchens dataset. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 753\u2013771. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_44"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Dave, I., Gupta, R., Rizve, M.N., Shah, M.: TCLR: temporal contrastive learning for video representation. arXiv preprint arXiv:2101.07974 (2021)","DOI":"10.1016\/j.cviu.2022.103406"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Diba, A., Sharma, V., Gool, L.V., Stiefelhagen, R.: DynamoNet: dynamic action and motion network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6192\u20136201 (2019)","DOI":"10.1109\/ICCV.2019.00629"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"issue":"9","key":"1_CR26","doi-asserted-by":"publisher","first-page":"1734","DOI":"10.1109\/TPAMI.2015.2496141","volume":"38","author":"A Dosovitskiy","year":"2015","unstructured":"Dosovitskiy, A., Fischer, P., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with exemplar convolutional neural networks. IEEE Trans. Pattern Anal. Mach. Intell. 38(9), 1734\u20131747 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1_CR27","volume-title":"The Gravity Model in Transportation Analysis: Theory and Extensions","author":"S Erlander","year":"1990","unstructured":"Erlander, S., Stewart, N.F.: The Gravity Model in Transportation Analysis: Theory and Extensions, vol. 3. VSP, Utrecht (1990)"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3299\u20133309 (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"1_CR30","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1933\u20131941 (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Fernando, B., Bilen, H., Gavves, E., Gould, S.: Self-supervised video representation learning with odd-one-out networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3636\u20133645 (2017)","DOI":"10.1109\/CVPR.2017.607"},{"key":"1_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1007\/978-3-030-01216-8_5","volume-title":"Computer Vision \u2013 ECCV 2018","author":"J Gao","year":"2018","unstructured":"Gao, J., Chen, K., Nevatia, R.: CTAP: complementary temporal action proposal generation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11206, pp. 70\u201385. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01216-8_5"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Jain, M., Karmanov, I., Snoek, C.G.M.: Motion-augmented self-training for video recognition at smaller scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10429\u201310438, October 2021","DOI":"10.1109\/ICCV48922.2021.01026"},{"key":"1_CR34","unstructured":"Grill, J.B., et al.: Bootstrap your own latent - a new approach to self-supervised learning. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M.F., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol. 33, pp. 21271\u201321284. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/f3ada80d5c4ee70142b17b8192b2958e-Paper.pdf"},{"key":"1_CR35","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: AVA: a video dataset of spatio-temporally localized atomic visual actions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6047\u20136056 (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"1_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"312","DOI":"10.1007\/978-3-030-58580-8_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Han","year":"2020","unstructured":"Han, T., Xie, W., Zisserman, A.: Memory-augmented dense predictive coding for video representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 312\u2013329. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_19"},{"key":"1_CR37","unstructured":"Han, T., Xie, W., Zisserman, A.: Self-supervised co-training for video representation learning. In: NeurIPS (2020)"},{"key":"1_CR38","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"1_CR39","doi-asserted-by":"crossref","unstructured":"Hu, K., Shao, J., Liu, Y., Raj, B., Savvides, M., Shen, Z.: Contrast and order representations for video self-supervised learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7939\u20137949, October 2021","DOI":"10.1109\/ICCV48922.2021.00784"},{"key":"1_CR40","doi-asserted-by":"crossref","unstructured":"Huang, D., et al.: ASCNet: self-supervised video representation learning with appearance-speed consistency. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8096\u20138105, October 2021","DOI":"10.1109\/ICCV48922.2021.00799"},{"key":"1_CR41","doi-asserted-by":"crossref","unstructured":"Jenni, S., Jin, H.: Time-equivariant contrastive video representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9970\u20139980 (2021)","DOI":"10.1109\/ICCV48922.2021.00982"},{"key":"1_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1007\/978-3-030-58604-1_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Jenni","year":"2020","unstructured":"Jenni, S., Meishvili, G., Favaro, P.: Video representation learning by recognizing temporal transformations. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 425\u2013442. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_26"},{"issue":"11","key":"1_CR43","doi-asserted-by":"publisher","first-page":"3781","DOI":"10.1109\/TIP.2015.2456412","volume":"24","author":"YG Jiang","year":"2015","unstructured":"Jiang, Y.G., Dai, Q., Liu, W., Xue, X., Ngo, C.W.: Human action recognition in unconstrained videos by explicit motion modeling. IEEE Trans. Image Process. 24(11), 3781\u20133795 (2015)","journal-title":"IEEE Trans. Image Process."},{"issue":"2","key":"1_CR44","doi-asserted-by":"publisher","first-page":"201","DOI":"10.3758\/BF03212378","volume":"14","author":"G Johansson","year":"1973","unstructured":"Johansson, G.: Visual perception of biological motion and a model for its analysis. Percept. Psychophys. 14(2), 201\u2013211 (1973)","journal-title":"Percept. Psychophys."},{"key":"1_CR45","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"1_CR46","unstructured":"Korbar, B., Tran, D., Torresani, L.: Cooperative learning of audio and video models from self-supervised synchronization. In: Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 31. Curran Associates, Inc. (2018). https:\/\/proceedings.neurips.cc\/paper\/2018\/file\/c4616f5a24a66668f11ca4fa80525dc4-Paper.pdf"},{"key":"1_CR47","doi-asserted-by":"publisher","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563 (2011). https:\/\/doi.org\/10.1109\/ICCV.2011.6126543","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"1_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1007\/978-3-319-46493-0_35","volume-title":"Computer Vision \u2013 ECCV 2016","author":"G Larsson","year":"2016","unstructured":"Larsson, G., Maire, M., Shakhnarovich, G.: Learning representations for automatic colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 577\u2013593. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_35"},{"key":"1_CR49","doi-asserted-by":"crossref","unstructured":"Lee, H.Y., Huang, J.B., Singh, M., Yang, M.H.: Unsupervised representation learning by sorting sequences. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 667\u2013676 (2017)","DOI":"10.1109\/ICCV.2017.79"},{"key":"1_CR50","doi-asserted-by":"crossref","unstructured":"Li, R., Zhang, Y., Qiu, Z., Yao, T., Liu, D., Mei, T.: Motion-focused contrastive learning of video representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2105\u20132114 (2021)","DOI":"10.1109\/ICCV48922.2021.00211"},{"key":"1_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1007\/978-3-030-01231-1_32","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Li","year":"2018","unstructured":"Li, Y., Li, Y., Vasconcelos, N.: RESOUND: towards action recognition without representation bias. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 520\u2013535. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_32"},{"key":"1_CR52","doi-asserted-by":"crossref","unstructured":"Lin, Y., Guo, X., Lu, Y.: Self-supervised video representation learning with meta-contrastive network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8239\u20138249 (2021)","DOI":"10.1109\/ICCV48922.2021.00813"},{"key":"1_CR53","unstructured":"Lin, Z., Qi, S., Zhengyang, S., Changhu, W.: Inter-intra variant dual representations for self-supervised video recognition. In: BMVC (2021)"},{"key":"1_CR54","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1016\/j.cviu.2015.10.012","volume":"152","author":"J Liu","year":"2016","unstructured":"Liu, J., Chen, C., Zhu, Y., Liu, W., Metaxas, D.N.: Video classification via weakly supervised sequence modeling. Comput. Vis. Image Underst. 152, 79\u201387 (2016)","journal-title":"Comput. Vis. Image Underst."},{"key":"1_CR55","doi-asserted-by":"publisher","first-page":"1978","DOI":"10.1109\/TIP.2022.3147032","volume":"31","author":"Y Liu","year":"2022","unstructured":"Liu, Y., Wang, K., Liu, L., Lan, H., Lin, L.: TCGL: temporal contrastive graph for self-supervised video representation learning. IEEE Trans. Image Process. 31, 1978\u20131993 (2022). https:\/\/doi.org\/10.1109\/TIP.2022.3147032","journal-title":"IEEE Trans. Image Process."},{"key":"1_CR56","doi-asserted-by":"crossref","unstructured":"Liu, Y., Ma, L., Zhang, Y., Liu, W., Chang, S.F.: Multi-granularity generator for temporal action proposal. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3604\u20133613 (2019)","DOI":"10.1109\/CVPR.2019.00372"},{"key":"1_CR57","doi-asserted-by":"crossref","unstructured":"Luo, D., et al.: Video cloze procedure for self-supervised spatio-temporal learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11701\u201311708 (2020)","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"1_CR58","unstructured":"Mettes, P., van der Pol, E., Snoek, C.: Hyperspherical prototype networks. In: Advances in Neural Information Processing Systems 32, pp. 1487\u20131497 (2019)"},{"key":"1_CR59","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"1_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"1_CR61","doi-asserted-by":"crossref","unstructured":"Morgado, P., Misra, I., Vasconcelos, N.: Robust audio-visual instance discrimination. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12934\u201312945 (2021)","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"1_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"1_CR63","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"1_CR64","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., Liu, W.: VideoMoCo: contrastive video representation learning with temporally adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11205\u201311214 (2021)","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"1_CR65","unstructured":"Patrick, M., et al.: Multi-modal self-supervision from generalized data transformations (2021)"},{"key":"1_CR66","doi-asserted-by":"crossref","unstructured":"Pickup, L.C., et al.: Seeing the arrow of time. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2035\u20132042 (2014)","DOI":"10.1109\/CVPR.2014.262"},{"key":"1_CR67","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Enhancing self-supervised video representation learning via multi-level feature optimization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7990\u20138001, October 2021","DOI":"10.1109\/ICCV48922.2021.00789"},{"key":"1_CR68","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6964\u20136974 (2021)","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"1_CR69","doi-asserted-by":"crossref","unstructured":"Regatti, J.R., Deshmukh, A.A., Manavoglu, E., Dogan, U.: Consensus clustering with unsupervised representation learning. In: 2021 International Joint Conference on Neural Networks (IJCNN), pp. 1\u20139. IEEE (2021)","DOI":"10.1109\/IJCNN52387.2021.9533714"},{"key":"1_CR70","unstructured":"Richemond, P.H., et al.: BYOL works even without batch statistics. CoRR abs\/2010.10241 (2020). https:\/\/arxiv.org\/abs\/2010.10241"},{"key":"1_CR71","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"281","DOI":"10.1007\/978-3-030-12939-2_20","volume-title":"Pattern Recognition","author":"L Sevilla-Lara","year":"2019","unstructured":"Sevilla-Lara, L., Liao, Y., G\u00fcney, F., Jampani, V., Geiger, A., Black, M.J.: On the integration of optical flow and action recognition. In: Brox, T., Bruhn, A., Fritz, M. (eds.) GCPR 2018. LNCS, vol. 11269, pp. 281\u2013297. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-12939-2_20"},{"key":"1_CR72","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems, pp. 568\u2013576 (2014)"},{"issue":"4","key":"1_CR73","doi-asserted-by":"publisher","first-page":"402","DOI":"10.2307\/2314570","volume":"74","author":"R Sinkhorn","year":"1967","unstructured":"Sinkhorn, R.: Diagonal equivalence to matrices with prescribed row and column sums. Am. Math. Mon. 74(4), 402\u2013405 (1967)","journal-title":"Am. Math. Mon."},{"key":"1_CR74","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. CoRR abs\/1212.0402 (2012). http:\/\/arxiv.org\/abs\/1212.0402"},{"key":"1_CR75","unstructured":"Srivastava, N., Mansimov, E., Salakhudinov, R.: Unsupervised learning of video representations using LSTMs. In: International Conference on Machine Learning, pp. 843\u2013852. PMLR (2015)"},{"key":"1_CR76","unstructured":"Sun, C., Baradel, F., Murphy, K., Schmid, C.: Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)"},{"key":"1_CR77","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"issue":"1","key":"1_CR78","first-page":"1","volume":"27","author":"PML Tammes","year":"1930","unstructured":"Tammes, P.M.L.: On the origin of number and arrangement of the places of exit on the surface of pollen-grains. Recueil des travaux botaniques n\u00e9erlandais 27(1), 1\u201384 (1930)","journal-title":"Recueil des travaux botaniques n\u00e9erlandais"},{"key":"1_CR79","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1007\/978-3-030-58621-8_45","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Tian","year":"2020","unstructured":"Tian, Y., Krishnan, D., Isola, P.: Contrastive multiview coding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 776\u2013794. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_45"},{"key":"1_CR80","doi-asserted-by":"crossref","unstructured":"Toering, M., Gatopoulos, I., Stol, M., Hu, V.T.: Self-supervised video representation learning with cross-stream prototypical contrasting. Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), January 2022","DOI":"10.1109\/WACV51458.2022.00092"},{"key":"1_CR81","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"1_CR82","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: Advances in Neural Information Processing Systems 29, pp. 613\u2013621 (2016)"},{"key":"1_CR83","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1007\/978-3-030-58520-4_30","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Wang","year":"2020","unstructured":"Wang, J., Jiao, J., Liu, Y.-H.: Self-supervised video representation learning by pace prediction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 504\u2013521. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_30"},{"key":"1_CR84","doi-asserted-by":"crossref","unstructured":"Wei, D., Lim, J.J., Zisserman, A., Freeman, W.T.: Learning and using the arrow of time. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8052\u20138060 (2018)","DOI":"10.1109\/CVPR.2018.00840"},{"key":"1_CR85","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"1_CR86","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., Shao, J., Xie, D., Zhuang, Y.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343 (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"1_CR87","doi-asserted-by":"crossref","unstructured":"Yang, J., Parikh, D., Batra, D.: Joint unsupervised learning of deep representations and image clusters. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5147\u20135156 (2016)","DOI":"10.1109\/CVPR.2016.556"},{"key":"1_CR88","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liu, C., Luo, D., Zhou, Y., Ye, Q.: Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6548\u20136557 (2020)","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"1_CR89","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liu, C., Luo, D., Zhou, Y., Ye, Q.: Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2020","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"1_CR90","unstructured":"Asano, Y.M., Rupprecht, C., Vedaldi, A.: Self-labelling via simultaneous clustering and representation learning. In: International Conference on Learning Representations (2020). https:\/\/openreview.net\/forum?id=Hyx-jyBFPr"},{"key":"1_CR91","unstructured":"You, Y., Gitman, I., Ginsburg, B.: Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888 (2017)"},{"key":"1_CR92","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-540-74936-3_22","volume-title":"Pattern Recognition","author":"C Zach","year":"2007","unstructured":"Zach, C., Pock, T., Bischof, H.: A duality based approach for realtime TV-L1 optical flow. In: Hamprecht, F.A., Schn\u00f6rr, C., J\u00e4hne, B. (eds.) DAGM 2007. LNCS, vol. 4713, pp. 214\u2013223. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74936-3_22"},{"key":"1_CR93","unstructured":"Zbontar, J., Jing, L., Misra, I., LeCun, Y., Deny, S.: Barlow twins: self-supervised learning via redundancy reduction. In: ICML (2021)"},{"key":"1_CR94","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"1_CR95","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2914\u20132923 (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"1_CR96","doi-asserted-by":"crossref","unstructured":"Zhuang, C., Zhai, A.L., Yamins, D.: Local aggregation for unsupervised learning of visual embeddings. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6002\u20136012 (2019)","DOI":"10.1109\/ICCV.2019.00610"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19821-2_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:47:12Z","timestamp":1666442832000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19821-2_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198205","9783031198212"],"references-count":96,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19821-2_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}