{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:04:22Z","timestamp":1765357462897,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shenzhen Higher Education Institutions Stable Support Program (Key Project)","award":["GXWD20220817123150002"],"award-info":[{"award-number":["GXWD20220817123150002"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006140"],"award-info":[{"award-number":["62006140"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shandong Provincial Natural Science Foundation","award":["ZR2020QF106"],"award-info":[{"award-number":["ZR2020QF106"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612131","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"2264-2274","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Fine-grained Key-Value Memory Enhanced Predictor for Video Representation Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6449-2727","authenticated-orcid":false,"given":"Xiaojie","family":"Li","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen) &amp; Peng Cheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0247-5221","authenticated-orcid":false,"given":"Jianlong","family":"Wu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7214-0921","authenticated-orcid":false,"given":"Shaowei","family":"He","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8167-8099","authenticated-orcid":false,"given":"Shuo","family":"Kang","sequence":"additional","affiliation":[{"name":"Sensetime Research, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9865-2212","authenticated-orcid":false,"given":"Yue","family":"Yu","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-5510","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2496275"},{"key":"e_1_3_2_1_2_1","first-page":"9912","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","volume":"33","author":"Caron Mathilde","year":"2020","unstructured":"Mathilde Caron, Ishan Misra, Julien Mairal, Priya Goyal, Piotr Bojanowski, and Armand Joulin. 2020. Unsupervised learning of visual features by contrasting cluster assignments. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9912--9924.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the International Conference on Machine Learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020b. A simple framework for contrastive learning of visual representations. In Proceedings of the International Conference on Machine Learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_5_1","volume-title":"Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297","author":"Chen Xinlei","year":"2020","unstructured":"Xinlei Chen, Haoqi Fan, Ross Girshick, and Kaiming He. 2020a. Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"e_1_3_2_1_7_1","volume-title":"Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio.","author":"Cho Kyunghyun","year":"2014","unstructured":"Kyunghyun Cho, Bart Van Merri\u00ebnboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103406"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00949"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547783"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2108306"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00301"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCI.2018.2840334"},{"key":"e_1_3_2_1_16_1","first-page":"21271","article-title":"Bootstrap your own latent-a new approach to self-supervised learning","volume":"33","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al. 2020. Bootstrap your own latent-a new approach to self-supervised learning. Advances in Neural Information Processing Systems, Vol. 33 (2020), 21271--21284.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"Fixing the teacher-student knowledge discrepancy in distillation. arXiv preprint arXiv:2103.16844","author":"Han Jiangfan","year":"2021","unstructured":"Jiangfan Han, Mengya Gao, Yujie Wang, Quanquan Li, Hongsheng Li, and Xiaogang Wang. 2021. Fixing the teacher-student knowledge discrepancy in distillation. arXiv preprint arXiv:2103.16844 (2021)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_19"},{"key":"e_1_3_2_1_20_1","first-page":"5679","article-title":"Self-supervised co-training for video representation learning","volume":"33","author":"Han Tengda","year":"2020","unstructured":"Tengda Han, Weidi Xie, and Andrew Zisserman. 2020b. Self-supervised co-training for video representation learning. Advances in Neural Information Processing Systems, Vol. 33 (2020), 5679--5690.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.373"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_24_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_25_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.310"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. pmlr, 448--456","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: accelerating deep network training by reducing internal covariate shift. In International Conference on Machine Learning. pmlr, 448--456."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00982"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_26"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.784"},{"key":"e_1_3_2_1_31_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3115626"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00036"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00358"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00813"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_39"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.751"},{"key":"e_1_3_2_1_41_1","volume-title":"Key-value memory networks for directly reading documents. arXiv preprint arXiv:1606.03126","author":"Miller Alexander","year":"2016","unstructured":"Alexander Miller, Adam Fisch, Jesse Dodge, Amir-Hossein Karimi, Antoine Bordes, and Jason Weston. 2016. Key-value memory networks for directly reading documents. arXiv preprint arXiv:1606.03126 (2016)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548180"},{"key":"e_1_3_2_1_43_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01438"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01430"},{"key":"e_1_3_2_1_47_1","volume-title":"Ji Woo Hong, Daehyeok Kim, Joshua Tian Jin Tee, and Chang D Yoo.","author":"Pham Trung X","year":"2022","unstructured":"Trung X Pham, Axi Niu, Zhang Kang, Sultan Rizky Madjid, Ji Woo Hong, Daehyeok Kim, Joshua Tian Jin Tee, and Chang D Yoo. 2022. Self-Supervised Visual Representation Learning via Residual Momentum. arXiv preprint arXiv:2211.09861 (2022)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_9"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision. 7990--8001","author":"Ding Shuangrui","year":"2021","unstructured":"Rui Qian, Yuxi Li, Huabin Liu, John See, Shuangrui Ding, Xian Liu, Dian Li, and Weiyao Lin. 2021a. Enhancing self-supervised video representation learning via multi-level feature optimization. In Proceedings of the IEEE International Conference on Computer Vision. 7990--8001."},{"key":"e_1_3_2_1_50_1","volume-title":"Controllable Augmentations for Video Representation Learning. arXiv preprint arXiv:2203.16632","author":"Dian Li.","year":"2022","unstructured":"Rui Qian, Weiyao Lin, John See, and Dian Li. 2022b. Controllable Augmentations for Video Representation Learning. arXiv preprint arXiv:2203.16632 (2022)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"e_1_3_2_1_52_1","volume-title":"Antoine Chassang, Carlo Gatta, and Yoshua Bengio.","author":"Romero Adriana","year":"2014","unstructured":"Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2014. Fitnets: Hints for thin deep nets. arXiv preprint arXiv:1412.6550 (2014)."},{"key":"e_1_3_2_1_53_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462313"},{"key":"e_1_3_2_1_55_1","volume-title":"Advances in Neural Information Processing Systems","volume":"28","author":"Sukhbaatar Sainbayar","year":"2015","unstructured":"Sainbayar Sukhbaatar, Jason Weston, Rob Fergus, et al. 2015. End-to-end memory networks. Advances in Neural Information Processing Systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3005348"},{"key":"e_1_3_2_1_58_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, Vol. 35 (2022), 10078--10093."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"e_1_3_2_1_61_1","volume-title":"Memory networks. arXiv preprint arXiv:1410.3916","author":"Weston Jason","year":"2014","unstructured":"Jason Weston, Sumit Chopra, and Antoine Bordes. 2014. Memory networks. arXiv preprint arXiv:1410.3916 (2014)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3265261"},{"key":"e_1_3_2_1_63_1","first-page":"1205","article-title":"Online meta adaptation for fast video object segmentation","volume":"42","author":"Xiao Huaxin","year":"2019","unstructured":"Huaxin Xiao, Bingyi Kang, Yu Liu, Maojun Zhang, and Jiashi Feng. 2019. Online meta adaptation for fast video object segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 42, 5 (2019), 1205--1217.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"e_1_3_2_1_65_1","volume-title":"Video representation learning with visual tempo consistency. arXiv preprint arXiv:2006.15489","author":"Yang Ceyuan","year":"2020","unstructured":"Ceyuan Yang, Yinghao Xu, Bo Dai, and Bolei Zhou. 2020. Video representation learning with visual tempo consistency. arXiv preprint arXiv:2006.15489 (2020)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2010.2050625"},{"key":"e_1_3_2_1_67_1","volume-title":"Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888","author":"You Yang","year":"2017","unstructured":"Yang You, Igor Gitman, and Boris Ginsburg. 2017. Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888 (2017)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20248"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2021.3139916"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.317"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00069"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612131","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612131","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:54:13Z","timestamp":1755820453000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612131"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":71,"alternative-id":["10.1145\/3581783.3612131","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612131","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}