{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,29]],"date-time":"2025-01-29T05:48:54Z","timestamp":1738129734843,"version":"3.33.0"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s11760-024-03644-w","type":"journal-article","created":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T08:24:12Z","timestamp":1733300652000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mitigating background bias in self-supervised video representation learning"],"prefix":"10.1007","volume":"19","author":[{"given":"Arif","family":"Akar","sequence":"first","affiliation":[]},{"given":"Ufuk Umut","family":"Senturk","sequence":"additional","affiliation":[]},{"given":"Nazli","family":"Ikizler-Cinbis","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,4]]},"reference":[{"key":"3644_CR1","doi-asserted-by":"crossref","unstructured":"Ahsan, U., Madhok, R., Essa, I.: Video jigsaw: unsupervised learning of spatiotemporal context for video action recognition. In: 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 179\u2013189. IEEE (2019)","DOI":"10.1109\/WACV.2019.00025"},{"key":"3644_CR2","unstructured":"Akar, A., Senturk, U.U., Ikizler-Cinbis, N.: Mac: mask-augmentation for motion-aware video representation learning. In: BMVC, p.\u00a05. (2022)"},{"key":"3644_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308. (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"3644_CR4","unstructured":"Choi, J., Gao, C., Messou, C.E.J., et\u00a0al.: Why can\u2019t I dance in the mall? Learning to mitigate scene bias in action recognition. In: NeurIPS (2019)"},{"key":"3644_CR5","doi-asserted-by":"crossref","unstructured":"Dave, I.R., Jenni, S., Shah, M.: No more shortcuts: realizing the potential of temporal self-supervision. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 1481\u20131491. (2024)","DOI":"10.1609\/aaai.v38i2.27913"},{"key":"3644_CR6","doi-asserted-by":"crossref","unstructured":"Ding, S., Li, M., Yang, T., et\u00a0al.: Motion-aware contrastive video representation learning via foreground-background merging. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9716\u20139726. (2022)","DOI":"10.1109\/CVPR52688.2022.00949"},{"key":"3644_CR7","doi-asserted-by":"crossref","unstructured":"Ding, S., Qian, R., Xiong, H.: Dual contrastive learning for spatio-temporal representation. In: Proceedings of the 30th ACM international conference on multimedia, pp. 5649\u20135658. (2022)","DOI":"10.1145\/3503161.3547783"},{"key":"3644_CR8","doi-asserted-by":"crossref","unstructured":"Duan, H., Zhao, N., Chen, K., et\u00a0al.: Transrank: self-supervised video representation learning via ranking-based transformation recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3000\u20133010. (2022)","DOI":"10.1109\/CVPR52688.2022.00301"},{"key":"3644_CR9","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., et\u00a0al.: A large-scale study on unsupervised spatiotemporal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3299\u20133309. (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"3644_CR10","unstructured":"Feichtenhofer, C., Fan, H., Li, Y., et\u00a0al.: Masked autoencoders as spatiotemporal learners. ArXiv arXiV:2205.09113 (2022)"},{"key":"3644_CR11","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Jain, M., Karmanov, I., et\u00a0al.: Motion-augmented self-training for video recognition at smaller scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10429\u201310438. (2021)","DOI":"10.1109\/ICCV48922.2021.01026"},{"key":"3644_CR12","unstructured":"Gutmann, M., Hyv\u00e4rinen, A.: Noise-contrastive estimation: A new estimation principle for unnormalized statistical models. In: Proceedings of the thirteenth international conference on artificial intelligence and statistics, JMLR Workshop and Conference Proceedings, pp. 297\u2013304. (2010)"},{"key":"3644_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., et\u00a0al.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738. (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"3644_CR14","doi-asserted-by":"crossref","unstructured":"Hu, K., Shao, J., Liu, Y., et\u00a0al.: Contrast and order representations for video self-supervised learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7939\u20137949. (2021)","DOI":"10.1109\/ICCV48922.2021.00784"},{"key":"3644_CR15","doi-asserted-by":"crossref","unstructured":"Huang, L., Liu, Y., Wang, B., et\u00a0al.: Self-supervised video representation learning by context and motion decoupling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13886\u201313895. (2021)","DOI":"10.1109\/CVPR46437.2021.01367"},{"key":"3644_CR16","doi-asserted-by":"crossref","unstructured":"Jenni, S., Meishvili, G., Favaro, P.: Video representation learning by recognizing temporal transformations. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXVIII 16, Springer, pp. 425\u2013442. (2020)","DOI":"10.1007\/978-3-030-58604-1_26"},{"key":"3644_CR17","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., et\u00a0al.: Hmdb: a large video database for human motion recognition. In: 2011 International conference on computer vision, IEEE, pp. 2556\u20132563. (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"3644_CR18","doi-asserted-by":"crossref","unstructured":"Lee, H.Y., Huang, J.B., Singh, M., et\u00a0al.: Unsupervised representation learning by sorting sequences. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 667\u2013676. (2017)","DOI":"10.1109\/ICCV.2017.79"},{"key":"3644_CR19","unstructured":"Li, W., Luo, D., Fang, B., et\u00a0al.: Video 3d sampling for self-supervised representation learning. arXiv preprint arXiv:2107.03578 (2021)"},{"key":"3644_CR20","doi-asserted-by":"crossref","unstructured":"Luo, D., Liu, C., Zhou, Y., et\u00a0al.: Video cloze procedure for self-supervised spatio-temporal learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 11701\u201311708 (2020)","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"3644_CR21","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"3644_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.adhoc.2020.102380","volume":"113","author":"Y Su","year":"2021","unstructured":"Su, Y., Xing, M., An, S., et al.: Vdarn: video disentangling attentive relation network for few-shot and zero-shot action recognition. Ad Hoc Netw. 113, 102380 (2021)","journal-title":"Ad Hoc Netw."},{"key":"3644_CR23","doi-asserted-by":"crossref","unstructured":"Thoker, F.M., Doughty, H., Snoek, C.G.: Tubelet-contrastive self-supervision for video-efficient generalization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13812\u201313823. (2023)","DOI":"10.1109\/ICCV51070.2023.01270"},{"key":"3644_CR24","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., et\u00a0al.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459. (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"3644_CR25","doi-asserted-by":"crossref","unstructured":"Wang, G., Zhou, Y., Luo, C., et\u00a0al.: Unsupervised visual representation learning by tracking patches in video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2563\u20132572. (2021)","DOI":"10.1109\/CVPR46437.2021.00259"},{"key":"3644_CR26","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiao, J., Liu, Y.H.: Self-supervised video representation learning by pace prediction. In: European Conference on Computer Vision, pp. 504\u2013521. Springer (2020)","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"3644_CR27","doi-asserted-by":"crossref","unstructured":"Wang, J., Gao, Y., Li, K., et\u00a0al.: Enhancing unsupervised video representation learning by decoupling the scene and the motion. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i11.17215"},{"key":"3644_CR28","doi-asserted-by":"crossref","unstructured":"Wang, J., Gao, Y., Li, K., et\u00a0al.: Removing the background by adding the background: Towards background robust self-supervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11804\u201311813. (2021)","DOI":"10.1109\/CVPR46437.2021.01163"},{"key":"3644_CR29","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., et\u00a0al.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343. (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"3644_CR30","doi-asserted-by":"crossref","unstructured":"Zach, C., Pock, T., Bischof, H.: A duality based approach for realtime tv-l 1 optical flow. In: Pattern Recognition: 29th DAGM Symposium, Heidelberg, Germany, September 12-14, 2007. Proceedings 29, pp. 214\u2013223. Springer (2007)","DOI":"10.1007\/978-3-540-74936-3_22"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03644-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-024-03644-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03644-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,28]],"date-time":"2025-01-28T17:51:17Z","timestamp":1738086677000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-024-03644-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,4]]},"references-count":30,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["3644"],"URL":"https:\/\/doi.org\/10.1007\/s11760-024-03644-w","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"type":"print","value":"1863-1703"},{"type":"electronic","value":"1863-1711"}],"subject":[],"published":{"date-parts":[[2024,12,4]]},"assertion":[{"value":"12 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 September 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 October 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This declaration is not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"55"}}