{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:16:04Z","timestamp":1750220164934,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":98,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Beijing Municipal Science & Technology Commission","award":["Z191100007119002"],"award-info":[{"award-number":["Z191100007119002"]}]},{"name":"the Key Research Program of Frontier Sciences, CAS","award":["ZDBS-LY-7024"],"award-info":[{"award-number":["ZDBS-LY-7024"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3547888","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:35Z","timestamp":1665416555000},"page":"1348-1357","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MaMiCo: Macro-to-Micro Semantic Correspondence for Self-supervised Video Representation Learning"],"prefix":"10.1145","author":[{"given":"Bo","family":"Fang","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenhao","family":"Wu","sequence":"additional","affiliation":[{"name":"The University of Sydney &amp; Baidu Inc., Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chang","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongliang","family":"He","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weipinng","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675","author":"Abu-El-Haija Sami","year":"2016","unstructured":"Sami Abu-El-Haija , Nisarg Kothari , Joonseok Lee , Paul Natsev , George Toderici , Balakrishnan Varadarajan , and Sudheendra Vijayanarasimhan . 2016. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 ( 2016 ). Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. 2016. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 (2016)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00356"},{"key":"e_1_3_2_2_3_1","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Araslanov Nikita","year":"2021","unstructured":"Nikita Araslanov , Simone Schaub-Meyer , and Stefan Roth . 2021 . Dense Unsupervised Learning for Video Segmentation . Advances in Neural Information Processing Systems , Vol. 34 (2021). Nikita Araslanov, Simone Schaub-Meyer, and Stefan Roth. 2021. Dense Unsupervised Learning for Video Segmentation. Advances in Neural Information Processing Systems , Vol. 34 (2021)."},{"key":"e_1_3_2_2_4_1","volume-title":"Can Temporal Information Help with Contrastive Self-Supervised Learning? arXiv preprint arXiv:2011.13046","author":"Bai Yutong","year":"2020","unstructured":"Yutong Bai , Haoqi Fan , Ishan Misra , Ganesh Venkatesh , Yongyi Lu , Yuyin Zhou , Qihang Yu , Vikas Chandra , and Alan Yuille . 2020. Can Temporal Information Help with Contrastive Self-Supervised Learning? arXiv preprint arXiv:2011.13046 ( 2020 ). Yutong Bai, Haoqi Fan, Ishan Misra, Ganesh Venkatesh, Yongyi Lu, Yuyin Zhou, Qihang Yu, Vikas Chandra, and Alan Yuille. 2020. Can Temporal Information Help with Contrastive Self-Supervised Learning? arXiv preprint arXiv:2011.13046 (2020)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00911"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"e_1_3_2_2_7_1","volume-title":"Self-Supervised Video Representation Learning by Video Incoherence Detection. arXiv preprint arXiv:2109.12493","author":"Cao Haozhi","year":"2021","unstructured":"Haozhi Cao , Yuecong Xu , Jianfei Yang , Kezhi Mao , Lihua Xie , Jianxiong Yin , and Simon See . 2021. Self-Supervised Video Representation Learning by Video Incoherence Detection. arXiv preprint arXiv:2109.12493 ( 2021 ). Haozhi Cao, Yuecong Xu, Jianfei Yang, Kezhi Mao, Lihua Xie, Jianxiong Yin, and Simon See. 2021. Self-Supervised Video Representation Learning by Video Incoherence Detection. arXiv preprint arXiv:2109.12493 (2021)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00305"},{"key":"e_1_3_2_2_10_1","first-page":"9912","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","volume":"33","author":"Caron Mathilde","year":"2020","unstructured":"Mathilde Caron , Ishan Misra , Julien Mairal , Priya Goyal , Piotr Bojanowski , and Armand Joulin . 2020 . Unsupervised learning of visual features by contrasting cluster assignments . Advances in Neural Information Processing Systems , Vol. 33 (2020), 9912 -- 9924 . Mathilde Caron, Ishan Misra, Julien Mairal, Priya Goyal, Piotr Bojanowski, and Armand Joulin. 2020. Unsupervised learning of visual features by contrasting cluster assignments. Advances in Neural Information Processing Systems , Vol. 33 (2020), 9912--9924.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16189"},{"key":"e_1_3_2_2_12_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen , Simon Kornblith , Mohammad Norouzi , and Geoffrey Hinton . 2020 b. A simple framework for contrastive learning of visual representations . In International conference on machine learning. PMLR, 1597--1607 . Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020b. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_2_13_1","volume-title":"Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297","author":"Chen Xinlei","year":"2020","unstructured":"Xinlei Chen , Haoqi Fan , Ross Girshick , and Kaiming He. 2020a. Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 ( 2020 ). Xinlei Chen, Haoqi Fan, Ross Girshick, and Kaiming He. 2020a. Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"e_1_3_2_2_15_1","volume-title":"Hyung Jin Chang, and Wonjun Hwang","author":"Cho Hyeon","year":"2020","unstructured":"Hyeon Cho , Taehoon Kim , Hyung Jin Chang, and Wonjun Hwang . 2020 . Self-supervised spatio-temporal representation learning using variable playback speed prediction. arXiv preprint arXiv:2003.02692 (2020). Hyeon Cho, Taehoon Kim, Hyung Jin Chang, and Wonjun Hwang. 2020. Self-supervised spatio-temporal representation learning using variable playback speed prediction. arXiv preprint arXiv:2003.02692 (2020)."},{"key":"e_1_3_2_2_16_1","volume-title":"Mamshad Nayeem Rizve, and Mubarak Shah","author":"Dave Ishan","year":"2022","unstructured":"Ishan Dave , Rohit Gupta , Mamshad Nayeem Rizve, and Mubarak Shah . 2022 . Tclr : Temporal contrastive learning for video representation. Computer Vision and Image Understanding ( 2022), 103406. Ishan Dave, Rohit Gupta, Mamshad Nayeem Rizve, and Mubarak Shah. 2022. Tclr: Temporal contrastive learning for video representation. Computer Vision and Image Understanding (2022), 103406."},{"key":"e_1_3_2_2_17_1","volume-title":"Imagenet: A large-scale hierarchical image database.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng , Wei Dong , Richard Socher , Li-Jia Li , Kai Li , and Li Fei-Fei . 2009 . Imagenet: A large-scale hierarchical image database. Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00153"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.607"},{"key":"e_1_3_2_2_22_1","volume-title":"Watching the world go by: Representation learning from unlabeled videos. arXiv preprint arXiv:2003.07990","author":"Gordon Daniel","year":"2020","unstructured":"Daniel Gordon , Kiana Ehsani , Dieter Fox , and Ali Farhadi . 2020. Watching the world go by: Representation learning from unlabeled videos. arXiv preprint arXiv:2003.07990 ( 2020 ). Daniel Gordon, Kiana Ehsani, Dieter Fox, and Ali Farhadi. 2020. Watching the world go by: Representation learning from unlabeled videos. arXiv preprint arXiv:2003.07990 (2020)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_2_24_1","first-page":"21271","article-title":"Bootstrap your own latent-a new approach to self-supervised learning","volume":"33","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill , Florian Strub , Florent Altch\u00e9 , Corentin Tallec , Pierre Richemond , Elena Buchatskaya , Carl Doersch , Bernardo Avila Pires , Zhaohan Guo , Mohammad Gheshlaghi Azar , 2020 . Bootstrap your own latent-a new approach to self-supervised learning . Advances in Neural Information Processing Systems , Vol. 33 (2020), 21271 -- 21284 . Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9 , Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al. 2020. Bootstrap your own latent-a new approach to self-supervised learning. Advances in Neural Information Processing Systems , Vol. 33 (2020), 21271--21284.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_25_1","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics. 297--304","author":"Gutmann Michael","year":"2010","unstructured":"Michael Gutmann and Aapo Hyv\"arinen. 2010 . Noise-contrastive estimation: A new estimation principle for unnormalized statistical models . In Proceedings of the thirteenth international conference on artificial intelligence and statistics. 297--304 . Michael Gutmann and Aapo Hyv\"arinen. 2010. Noise-contrastive estimation: A new estimation principle for unnormalized statistical models. In Proceedings of the thirteenth international conference on artificial intelligence and statistics. 297--304."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_19"},{"key":"e_1_3_2_2_28_1","first-page":"5679","article-title":"Self-supervised co-training for video representation learning","volume":"33","author":"Han Tengda","year":"2020","unstructured":"Tengda Han , Weidi Xie , and Andrew Zisserman . 2020 b. Self-supervised co-training for video representation learning . Advances in Neural Information Processing Systems , Vol. 33 (2020), 5679 -- 5690 . Tengda Han, Weidi Xie, and Andrew Zisserman. 2020b. Self-supervised co-training for video representation learning. Advances in Neural Information Processing Systems , Vol. 33 (2020), 5679--5690.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_2_30_1","volume-title":"Masked autoencoders are scalable vision learners. arXiv preprint arXiv:2111.06377","author":"He Kaiming","year":"2021","unstructured":"Kaiming He , Xinlei Chen , Saining Xie , Yanghao Li , Piotr Doll\u00e1r , and Ross Girshick . 2021. Masked autoencoders are scalable vision learners. arXiv preprint arXiv:2111.06377 ( 2021 ). Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Doll\u00e1r, and Ross Girshick. 2021. Masked autoencoders are scalable vision learners. arXiv preprint arXiv:2111.06377 (2021)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_2_32_1","volume-title":"Learning deep representations by mutual information estimation and maximization. arXiv preprint arXiv:1808.06670","author":"Hjelm R Devon","year":"2018","unstructured":"R Devon Hjelm , Alex Fedorov , Samuel Lavoie-Marchildon , Karan Grewal , Phil Bachman , Adam Trischler , and Yoshua Bengio . 2018. Learning deep representations by mutual information estimation and maximization. arXiv preprint arXiv:1808.06670 ( 2018 ). R Devon Hjelm, Alex Fedorov, Samuel Lavoie-Marchildon, Karan Grewal, Phil Bachman, Adam Trischler, and Yoshua Bengio. 2018. Learning deep representations by mutual information estimation and maximization. arXiv preprint arXiv:1808.06670 (2018)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00799"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00982"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_26"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_26"},{"key":"e_1_3_2_2_38_1","volume-title":"Self-supervised spatiotemporal feature learning via video rotation prediction. arXiv preprint arXiv:1811.11387","author":"Jing Longlong","year":"2018","unstructured":"Longlong Jing , Xiaodong Yang , Jingen Liu , and Yingli Tian . 2018. Self-supervised spatiotemporal feature learning via video rotation prediction. arXiv preprint arXiv:1811.11387 ( 2018 ). Longlong Jing, Xiaodong Yang, Jingen Liu, and Yingli Tian. 2018. Self-supervised spatiotemporal feature learning via video rotation prediction. arXiv preprint arXiv:1811.11387 (2018)."},{"key":"e_1_3_2_2_39_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev etal 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017).  Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"e_1_3_2_2_41_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Komodakis Nikos","year":"2018","unstructured":"Nikos Komodakis and Spyros Gidaris . 2018 . Unsupervised representation learning by predicting image rotations . In International Conference on Learning Representations (ICLR). Nikos Komodakis and Spyros Gidaris. 2018. Unsupervised representation learning by predicting image rotations. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00358"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.79"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00406"},{"key":"e_1_3_2_2_46_1","volume-title":"Prototypical Contrastive Learning of Unsupervised Representations. In International Conference on Learning Representations.","author":"Li Junnan","year":"2020","unstructured":"Junnan Li , Pan Zhou , Caiming Xiong , and Steven Hoi . 2020 . Prototypical Contrastive Learning of Unsupervised Representations. In International Conference on Learning Representations. Junnan Li, Pan Zhou, Caiming Xiong, and Steven Hoi. 2020. Prototypical Contrastive Learning of Unsupervised Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_47_1","volume-title":"Dense Semantic Contrast for Self-Supervised Visual Representation Learning. In MM '21: ACM. 1368--1376","author":"Li Xiaoni","year":"2021","unstructured":"Xiaoni Li , Yu Zhou , Yifei Zhang , Aoting Zhang , Wei Wang , Ning Jiang , Haiying Wu , and Weiping Wang . 2021 . Dense Semantic Contrast for Self-Supervised Visual Representation Learning. In MM '21: ACM. 1368--1376 . Xiaoni Li, Yu Zhou, Yifei Zhang, Aoting Zhang, Wei Wang, Ning Jiang, Haiying Wu, and Weiping Wang. 2021. Dense Semantic Contrast for Self-Supervised Visual Representation Learning. In MM '21: ACM. 1368--1376."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3160860"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3473342"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6707--6717","author":"Misra Ishan","key":"e_1_3_2_2_52_1","unstructured":"Ishan Misra and Laurens van der Maaten. 2020. Self-supervised learning of pretext-invariant representations . In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6707--6717 . Ishan Misra and Laurens van der Maaten. 2020. Self-supervised learning of pretext-invariant representations. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6707--6717."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"e_1_3_2_2_55_1","first-page":"4489","article-title":"Unsupervised learning of dense visual representations","volume":"33","author":"Pinheiro Pedro O O","year":"2020","unstructured":"Pedro O O Pinheiro , Amjad Almahairi , Ryan Benmalek , Florian Golemo , and Aaron C Courville . 2020 . Unsupervised learning of dense visual representations . Advances in Neural Information Processing Systems , Vol. 33 (2020), 4489 -- 4500 . Pedro O O Pinheiro, Amjad Almahairi, Ryan Benmalek, Florian Golemo, and Aaron C Courville. 2020. Unsupervised learning of dense visual representations. Advances in Neural Information Processing Systems , Vol. 33 (2020), 4489--4500.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_56_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord , Yazhe Li , and Oriol Vinyals . 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 ( 2018 ). Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"e_1_3_2_2_59_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7990--8001","author":"Ding Shuangrui","year":"2021","unstructured":"Rui Qian, Yuxi Li, Huabin Liu, John See, Shuangrui Ding , Xian Liu , Dian Li , and Weiyao Lin . 2021 a. Enhancing Self-supervised Video Representation Learning via Multi-level Feature Optimization . In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7990--8001 . Rui Qian, Yuxi Li, Huabin Liu, John See, Shuangrui Ding, Xian Liu, Dian Li, and Weiyao Lin. 2021a. Enhancing Self-supervised Video Representation Learning via Multi-level Feature Optimization. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7990--8001."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"e_1_3_2_2_61_1","volume-title":"Fahad Shahbaz Khan, and Michael Ryoo","author":"Ranasinghe Kanchana","year":"2021","unstructured":"Kanchana Ranasinghe , Muzammal Naseer , Salman Khan , Fahad Shahbaz Khan, and Michael Ryoo . 2021 . Self-supervised Video Transformer . arXiv preprint arXiv:2112.01514 (2021). Kanchana Ranasinghe, Muzammal Naseer, Salman Khan, Fahad Shahbaz Khan, and Michael Ryoo. 2021. Self-supervised Video Transformer. arXiv preprint arXiv:2112.01514 (2021)."},{"volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 1255--1265","author":"Recasens Adria","key":"e_1_3_2_2_62_1","unstructured":"Adria Recasens , Pauline Luc , Jean-Baptiste Alayrac , Luyu Wang , Florian Strub , Corentin Tallec , Mateusz Malinowski , Viorica Pua trua ucean, Florent Altch\u00e9 , Michal Valko, et al. 2021. Broaden your views for self-supervised video learning . In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 1255--1265 . Adria Recasens, Pauline Luc, Jean-Baptiste Alayrac, Luyu Wang, Florian Strub, Corentin Tallec, Mateusz Malinowski, Viorica Pua trua ucean, Florent Altch\u00e9 , Michal Valko, et al. 2021. Broaden your views for self-supervised video learning. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 1255--1265."},{"key":"e_1_3_2_2_63_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro , Amir Roshan Zamir, and Mubarak Shah . 2012 . UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012). Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_2_64_1","volume-title":"Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743","author":"Sun Chen","year":"2019","unstructured":"Chen Sun , Fabien Baradel , Kevin Murphy , and Cordelia Schmid . 2019. Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 ( 2019 ). Chen Sun, Fabien Baradel, Kevin Murphy, and Cordelia Schmid. 2019. Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413694"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"e_1_3_2_2_67_1","volume-title":"VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. arXiv preprint arXiv:2203.12602","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong , Yibing Song , Jue Wang , and Limin Wang . 2022. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. arXiv preprint arXiv:2203.12602 ( 2022 ). Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. arXiv preprint arXiv:2203.12602 (2022)."},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_2_69_1","volume-title":"Long-short temporal contrastive learning of video transformers. arXiv preprint arXiv:2106.09212","author":"Wang Jue","year":"2021","unstructured":"Jue Wang , Gedas Bertasius , Du Tran , and Lorenzo Torresani . 2021a. Long-short temporal contrastive learning of video transformers. arXiv preprint arXiv:2106.09212 ( 2021 ). Jue Wang, Gedas Bertasius, Du Tran, and Lorenzo Torresani. 2021a. Long-short temporal contrastive learning of video transformers. arXiv preprint arXiv:2106.09212 (2021)."},{"key":"e_1_3_2_2_70_1","first-page":"7","article-title":"Enhancing unsupervised video representation learning by decoupling the scene and the motion","volume":"1","author":"Wang Jinpeng","year":"2021","unstructured":"Jinpeng Wang , Yuting Gao , Ke Li , Xinyang Jiang , Xiaowei Guo , Rongrong Ji , and Xing Sun . 2021 b. Enhancing unsupervised video representation learning by decoupling the scene and the motion . In AAAI , Vol. 1. 7 . Jinpeng Wang, Yuting Gao, Ke Li, Xinyang Jiang, Xiaowei Guo, Rongrong Ji, and Xing Sun. 2021b. Enhancing unsupervised video representation learning by decoupling the scene and the motion. In AAAI, Vol. 1. 7.","journal-title":"AAAI"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01163"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00413"},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00304"},{"key":"e_1_3_2_2_76_1","volume-title":"Exploring Set Similarity for Dense Self-supervised Representation Learning. arXiv preprint arXiv:2107.08712","author":"Wang Zhaoqing","year":"2021","unstructured":"Zhaoqing Wang , Qiang Li , Guoxin Zhang , Pengfei Wan , Wen Zheng , Nannan Wang , Mingming Gong , and Tongliang Liu . 2021d. Exploring Set Similarity for Dense Self-supervised Representation Learning. arXiv preprint arXiv:2107.08712 ( 2021 ). Zhaoqing Wang, Qiang Li, Guoxin Zhang, Pengfei Wan, Wen Zheng, Nannan Wang, Mingming Gong, and Tongliang Liu. 2021d. Exploring Set Similarity for Dense Self-supervised Representation Learning. arXiv preprint arXiv:2107.08712 (2021)."},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00840"},{"key":"e_1_3_2_2_78_1","volume-title":"Weakly-supervised spatio-temporal anomaly detection in surveillance video. IJCAI","author":"Wu Jie","year":"2021","unstructured":"Jie Wu , Wei Zhang , Guanbin Li , Wenhao Wu , Xiao Tan , Yingying Li , Errui Ding , and Liang Lin . 2021b. Weakly-supervised spatio-temporal anomaly detection in surveillance video. IJCAI ( 2021 ). Jie Wu, Wei Zhang, Guanbin Li, Wenhao Wu, Xiao Tan, Yingying Li, Errui Ding, and Liang Lin. 2021b. Weakly-supervised spatio-temporal anomaly detection in surveillance video. IJCAI (2021)."},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16401"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00632"},{"key":"e_1_3_2_2_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475344"},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"e_1_3_2_2_83_1","volume-title":"Temporal Saliency Query Network for Efficient Video Recognition. ECCV","author":"Xia Boyang","year":"2022","unstructured":"Boyang Xia , Zhihao Wang , Wenhao Wu , Haoran Wang , and Jungong Han . 2022a. Temporal Saliency Query Network for Efficient Video Recognition. ECCV ( 2022 ). Boyang Xia, Zhihao Wang, Wenhao Wu, Haoran Wang, and Jungong Han. 2022a. Temporal Saliency Query Network for Efficient Video Recognition. ECCV (2022)."},{"key":"e_1_3_2_2_84_1","volume-title":"NSNet: Non-saliency Suppression Sampler for Efficient Video Recognition. ECCV","author":"Xia Boyang","year":"2022","unstructured":"Boyang Xia , Wenhao Wu , Haoran Wang , Rui Su , Dongliang He , Haosen Yang , Xiaoran Fan , and Wanli Ouyang . 2022b. NSNet: Non-saliency Suppression Sampler for Efficient Video Recognition. ECCV ( 2022 ). Boyang Xia, Wenhao Wu, Haoran Wang, Rui Su, Dongliang He, Haosen Yang, Xiaoran Fan, and Wanli Ouyang. 2022b. NSNet: Non-saliency Suppression Sampler for Efficient Video Recognition. ECCV (2022)."},{"key":"e_1_3_2_2_85_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"e_1_3_2_2_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01641"},{"key":"e_1_3_2_2_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"e_1_3_2_2_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_2_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"e_1_3_2_2_90_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20212"},{"key":"e_1_3_2_2_91_1","first-page":"7","article-title":"Seco: Exploring sequence supervision for unsupervised representation learning","volume":"2","author":"Yao Ting","year":"2021","unstructured":"Ting Yao , Yiheng Zhang , Zhaofan Qiu , Yingwei Pan , and Tao Mei . 2021 . Seco: Exploring sequence supervision for unsupervised representation learning . In AAAI , Vol. 2. 7 . Ting Yao, Yiheng Zhang, Zhaofan Qiu, Yingwei Pan, and Tao Mei. 2021. Seco: Exploring sequence supervision for unsupervised representation learning. In AAAI, Vol. 2. 7.","journal-title":"AAAI"},{"key":"e_1_3_2_2_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"e_1_3_2_2_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00637"},{"key":"e_1_3_2_2_94_1","volume-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis . 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928 ( 2016 ). Sergey Zagoruyko and Nikos Komodakis. 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928 (2016)."},{"key":"e_1_3_2_2_95_1","volume-title":"International Conference on Machine Learning. PMLR, 12310--12320","author":"Zbontar Jure","year":"2021","unstructured":"Jure Zbontar , Li Jing , Ishan Misra , Yann LeCun , and St\u00e9phane Deny . 2021 . Barlow twins: Self-supervised learning via redundancy reduction . In International Conference on Machine Learning. PMLR, 12310--12320 . Jure Zbontar, Li Jing, Ishan Misra, Yann LeCun, and St\u00e9phane Deny. 2021. Barlow twins: Self-supervised learning via redundancy reduction. In International Conference on Machine Learning. PMLR, 12310--12320."},{"key":"e_1_3_2_2_96_1","first-page":"7025","article-title":"Contrastive learning of global and local video representations","volume":"34","author":"Zeng Zhaoyang","year":"2021","unstructured":"Zhaoyang Zeng , Daniel McDuff , Yale Song , 2021 . Contrastive learning of global and local video representations . Advances in Neural Information Processing Systems , Vol. 34 (2021), 7025 -- 7040 . Zhaoyang Zeng, Daniel McDuff, Yale Song, et al. 2021. Contrastive learning of global and local video representations. Advances in Neural Information Processing Systems , Vol. 34 (2021), 7025--7040.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_97_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"e_1_3_2_2_98_1","volume-title":"Contrastive Spatio-Temporal Pretext Learning for Self-supervised Video Representation. arXiv preprint arXiv:2112.08913","author":"Zhang Yujia","year":"2021","unstructured":"Yujia Zhang , Lai-Man Po , Xuyuan Xu , Mengyang Liu , Yexin Wang , Weifeng Ou , Yuzhi Zhao , and Wing-Yin Yu. 2021. Contrastive Spatio-Temporal Pretext Learning for Self-supervised Video Representation. arXiv preprint arXiv:2112.08913 ( 2021 ). Yujia Zhang, Lai-Man Po, Xuyuan Xu, Mengyang Liu, Yexin Wang, Weifeng Ou, Yuzhi Zhao, and Wing-Yin Yu. 2021. Contrastive Spatio-Temporal Pretext Learning for Self-supervised Video Representation. arXiv preprint arXiv:2112.08913 (2021)."}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Lisboa Portugal","acronym":"MM '22"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3547888","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3547888","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:30Z","timestamp":1750186830000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3547888"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":98,"alternative-id":["10.1145\/3503161.3547888","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3547888","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}