{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:31:41Z","timestamp":1763202701787,"version":"3.40.3"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198205"},{"type":"electronic","value":"9783031198212"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19821-2_29","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:12:59Z","timestamp":1666440779000},"page":"506-522","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Federated Self-supervised Learning for\u00a0Video Understanding"],"prefix":"10.1007","author":[{"given":"Yasar Abbas Ur","family":"Rehman","sequence":"first","affiliation":[]},{"given":"Yan","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Jiajun","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Pedro Porto Buarque","family":"de Gusm\u00e3o","sequence":"additional","affiliation":[]},{"given":"Nicholas","family":"Lane","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: learning sound representations from unlabeled video. In: Advances in Neural Information Processing Systems, vol. 29 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Benaim, S., et al.: SpeedNet: learning the speediness in videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9922\u20139931 (2020)","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"29_CR3","unstructured":"Beutel, D.J., Topal, T., Mathur, A., Qiu, X., Parcollet, T., Lane, N.D.: Flower: a friendly federated learning research framework. arXiv preprint arXiv:2007.14390 (2020)"},{"key":"29_CR4","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"29_CR5","doi-asserted-by":"crossref","unstructured":"Cho, H., Kim, T., Chang, H.J., Hwang, W.: Self-supervised visual learning by variable playback speeds prediction of a video. IEEE Access 9, 79562\u201379571 (2021)","DOI":"10.1109\/ACCESS.2021.3084840"},{"key":"29_CR6","unstructured":"Contributors, M.: MMCV: OpenMMLab computer vision foundation (2018). https:\/\/github.com\/open-mmlab\/mmcv"},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"29_CR8","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3299\u20133309 (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Gao, Y., et al.: End-to-end speech recognition from federated acoustic models. arXiv preprint. arXiv:2104.14297 (2021)","DOI":"10.1109\/ICASSP43922.2022.9747161"},{"key":"29_CR10","unstructured":"Goyal, P., et al.: Vision models are more robust and fair when pretrained on uncurated images without supervision. arXiv preprint arXiv:2202.08360 (2022)"},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Video representation learning by dense predictive coding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"29_CR12","unstructured":"Han, T., Xie, W., Zisserman, A.: Self-supervised co-training for video representation learning. In: Advances in Neural Information Processing Systems, vol. 33, pp. 5679\u20135690 (2020)"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"issue":"1","key":"29_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1162\/neco.1997.9.1.1","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Flat minima. Neural Comput. 9(1), 1\u201342 (1997)","journal-title":"Neural Comput."},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Hu, Z., Xie, H., Yu, L., Gao, X., Shang, Z., Zhang, Y.: Dynamic-aware federated learning for face forgery video detection. ACM Trans. Intell. Syst. Technol. (TIST) 13, 1\u201325 (2022)","DOI":"10.1145\/3501814"},{"key":"29_CR16","unstructured":"Izmailov, P., Podoprikhin, D., Garipov, T., Vetrov, D., Wilson, A.G.: Averaging weights leads to wider optima and better generalization. In: 34th Conference on Uncertainty in Artificial Intelligence 2018, UAI 2018, pp. 876\u2013885. Association For Uncertainty in Artificial Intelligence (AUAI) (2018)"},{"key":"29_CR17","unstructured":"Jain, A.K., Deb, D., Engelsma, J.J.: Biometrics: trust, but verify. arXiv preprint arXiv:2105.06625 (2021)"},{"key":"29_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1007\/978-3-030-58604-1_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Jenni","year":"2020","unstructured":"Jenni, S., Meishvili, G., Favaro, P.: Video representation learning by recognizing temporal transformations. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 425\u2013442. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_26"},{"key":"29_CR19","unstructured":"Jing, L., Yang, X., Liu, J., Tian, Y.: Self-supervised spatiotemporal feature learning via video rotation prediction. arXiv preprint arXiv:1811.11387 (2018)"},{"key":"29_CR20","unstructured":"Kairouz, P., et al.: Advances and open problems in federated learning. arXiv preprint arXiv:1912.04977 (2019)"},{"key":"29_CR21","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"29_CR22","unstructured":"Keskar, N.S., Mudigere, D., Nocedal, J., Smelyanskiy, M., Tang, P.T.P.: On large-batch training for deep learning: generalization gap and sharp minima. arXiv preprint arXiv:1609.04836 (2016)"},{"key":"29_CR23","doi-asserted-by":"crossref","unstructured":"Kolesnikov, A., Zhai, X., Beyer, L.: Revisiting self-supervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1920\u20131929 (2019)","DOI":"10.1109\/CVPR.2019.00202"},{"key":"29_CR24","unstructured":"Krizhevsky, A.: Learning multiple layers of features from tiny images (2009)"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: Proceedings of the International Conference on Computer Vision (ICCV) (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Lee, H.Y., Huang, J.B., Singh, M., Yang, M.H.: Unsupervised representation learning by sorting sequences. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 667\u2013676 (2017)","DOI":"10.1109\/ICCV.2017.79"},{"key":"29_CR27","unstructured":"Li, H., Xu, Z., Taylor, G., Studer, C., Goldstein, T.: Visualizing the loss landscape of neural nets. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems, pp. 6391\u20136401 (2018)"},{"key":"29_CR28","unstructured":"Li, T., Wang, L.: Learning spatiotemporal features via video and text pair discrimination. arXiv preprint arXiv:2001.05691 (2020)"},{"key":"29_CR29","unstructured":"McMahan, B., Moore, E., Ramage, D., Hampson, S., Arcas, B.A.: Communication-efficient learning of deep networks from decentralized data. In: Artificial Intelligence and Statistics, pp. 1273\u20131282. PMLR (2017)"},{"key":"29_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"29_CR31","doi-asserted-by":"crossref","unstructured":"Park, H., Sjosund, L., Yoo, Y., Monet, N., Bang, J., Kwak, N.: SINet: extreme lightweight portrait segmentation networks with spatial squeeze module and information blocking decoder. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2066\u20132074 (2020)","DOI":"10.1109\/WACV45572.2020.9093588"},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Angelova, A., Ryoo, M.S.: Evolving losses for unsupervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 133\u2013142 (2020)","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"29_CR33","unstructured":"Reddi, S.J., et al.: Adaptive federated optimization. In: International Conference on Learning Representations (2020)"},{"key":"29_CR34","doi-asserted-by":"crossref","unstructured":"Romijnders, R., et al.: Representation learning from videos in-the-wild: an object-centric approach. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 177\u2013187 (2021)","DOI":"10.1109\/WACV48630.2021.00022"},{"issue":"3","key":"29_CR35","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vision 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"29_CR36","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild (2012)"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"29_CR38","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Anticipating visual representations from unlabeled video. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 98\u2013106 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Wang, G., Zhou, Y., Luo, C., Xie, W., Zeng, W., Xiong, Z.: Unsupervised visual representation learning by tracking patches in video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2563\u20132572 (2021)","DOI":"10.1109\/CVPR46437.2021.00259"},{"key":"29_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1007\/978-3-030-58520-4_30","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Wang","year":"2020","unstructured":"Wang, J., Jiao, J., Liu, Y.-H.: Self-supervised video representation learning by pace prediction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 504\u2013521. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_30"},{"key":"29_CR41","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2794\u20132802 (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"29_CR42","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., Shao, J., Xie, D., Zhuang, Y.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343 (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"29_CR43","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liu, C., Luo, D., Zhou, Y., Ye, Q.: Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6548\u20136557 (2020)","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"29_CR44","unstructured":"Zhang, F., et al.: Federated unsupervised representation learning. arXiv preprint arXiv:2010.08982 (2020)"},{"key":"29_CR45","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2914\u20132923 (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"29_CR46","doi-asserted-by":"crossref","unstructured":"Zhuang, W., Gan, X., Wen, Y., Zhang, S., Yi, S.: Collaborative unsupervised visual representation learning from decentralized data. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4912\u20134921 (2021)","DOI":"10.1109\/ICCV48922.2021.00487"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19821-2_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:54:35Z","timestamp":1666443275000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19821-2_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198205","9783031198212"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19821-2_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}