{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T00:30:20Z","timestamp":1773361820856,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shandong Project towards the Integration of Education and Industry","award":["2022PY009"],"award-info":[{"award-number":["2022PY009"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272254"],"award-info":[{"award-number":["62272254"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key R&D Program of Shandong (Major scientific and technological innovation projects)","award":["2022CXGC020107"],"award-info":[{"award-number":["2022CXGC020107"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611696","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"7101-7110","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Sample Less, Learn More: Efficient Action Recognition via Frame Feature Restoration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7436-0162","authenticated-orcid":false,"given":"Harry","family":"Cheng","sequence":"first","affiliation":[{"name":"Shandong University, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8691-5372","authenticated-orcid":false,"given":"Yangyang","family":"Guo","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1109-5028","authenticated-orcid":false,"given":"Zhiyong","family":"Cheng","sequence":"additional","affiliation":[{"name":"Qilu University of Technology (Shandong Academy of Sciences), Jinan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4846-2015","authenticated-orcid":false,"given":"Mohan","family":"Kankanhalli","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"ViViT: A Video Vision Transformer. In International Conference on Computer Vision. IEEE, 6816--6826","author":"Arnab Anurag","year":"2021","unstructured":"Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lucic, and Cordelia Schmid. 2021. ViViT: A Video Vision Transformer. In International Conference on Computer Vision. IEEE, 6816--6826."},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Machine Learning. PMLR, 813--824","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?. In International Conference on Machine Learning. PMLR, 813--824."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3118287"},{"key":"e_1_3_2_1_5_1","unstructured":"Mostafa Dehghani Josip Djolonga Basil Mustafa Piotr Padlewski Jonathan Heek Justin Gilmer Andreas Steiner Mathilde Caron Robert Geirhos Ibrahim Alabdulmohsin Rodolphe Jenatton Lucas Beyer Michael Tschannen Anurag Arnab Xiao Wang Carlos Riquelme Matthias Minderer Joan Puigcerver Utku Evci Manoj Kumar Sjoerd van Steenkiste Gamaleldin F. Elsayed Aravindh Mahendran Fisher Yu Avital Oliver Fantine Huot Jasmijn Bastings Mark Patrick Collier Alexey A. Gritsenko Vighnesh Birodkar Cristina Vasconcelos Yi Tay Thomas Mensink Alexander Kolesnikov Filip Pavetic Dustin Tran Thomas Kipf Mario Lucic Xiaohua Zhai Daniel Keysers Jeremiah Harmsen and Neil Houlsby. 2023. Scaling Vision Transformers to 22 Billion Parameters. CoRR Vol. abs\/2302.05442 (2023) 1--21."},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Learning Representations. OpenReview.net, 1--12","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. OpenReview.net, 1--12."},{"key":"e_1_3_2_1_7_1","volume-title":"Multiscale Vision Transformers. In International Conference on Computer Vision. IEEE, 6804--6815","author":"Fan Haoqi","year":"2021","unstructured":"Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra Malik, and Christoph Feichtenhofer. 2021. Multiscale Vision Transformers. In International Conference on Computer Vision. IEEE, 6804--6815."},{"key":"e_1_3_2_1_8_1","volume-title":"DTR: An Information Bottleneck Based Regularization Framework for Video Action Recognition. In ACM International Conference on Multimedia. ACM, 3877--3885","author":"Fan Jiawei","year":"2022","unstructured":"Jiawei Fan, Yu Zhao, Xie Yu, Lihua Ma, Junqi Liu, Fangqiu Yi, and Boxun Li. 2022. DTR: An Information Bottleneck Based Regularization Framework for Video Action Recognition. In ACM International Conference on Multimedia. ACM, 3877--3885."},{"key":"e_1_3_2_1_9_1","volume-title":"Cox","author":"Fan Quanfu","year":"2019","unstructured":"Quanfu Fan, Chun-Fu (Richard) Chen, Hilde Kuehne, Marco Pistoia, and David D. Cox. 2019. More Is Less: Learning Efficient Video Representations by Big-Little Network and Depthwise Temporal Aggregation. In Advances in Neural Information Processing Systems. 2261--2270."},{"key":"e_1_3_2_1_10_1","volume-title":"SlowFast Networks for Video Recognition. In International Conference on Computer Vision. IEEE, 6201--6210","author":"Feichtenhofer Christoph","year":"2019","unstructured":"Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. 2019. SlowFast Networks for Video Recognition. In International Conference on Computer Vision. IEEE, 6201--6210."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3565266"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3128322"},{"key":"e_1_3_2_1_13_1","volume-title":"Roy-Chowdhury","author":"Gupta Akash","year":"2020","unstructured":"Akash Gupta, Abhishek Aich, and Amit K. Roy-Chowdhury. 2020. ALANET: Adaptive Latent Attention Network for Joint Video Deblurring and Interpolation. In ACM Multimedia. ACM, 256--264."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00082"},{"key":"e_1_3_2_1_15_1","volume-title":"Deep Residual Learning for Image Recognition. In Conference on Computer Vision and Pattern Recognition. IEEE, 770--778","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In Conference on Computer Vision and Pattern Recognition. IEEE, 770--778."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Learning Representations. OpenReview.net, 1--13","author":"Huang Ziyuan","unstructured":"Ziyuan Huang, Shiwei Zhang, Liang Pan, Zhiwu Qing, Mingqian Tang, Ziwei Liu, and Marcelo H. Ang Jr. 2022. TAda! Temporally-Adaptive Convolutions for Video Understanding. In International Conference on Learning Representations. OpenReview.net, 1--13."},{"key":"e_1_3_2_1_18_1","volume-title":"STM: SpatioTemporal and Motion Encoding for Action Recognition. In International Conference on Computer Vision. IEEE","author":"Jiang Boyuan","year":"2019","unstructured":"Boyuan Jiang, Mengmeng Wang, Weihao Gan, Wei Wu, and Junjie Yan. 2019. STM: SpatioTemporal and Motion Encoding for Action Recognition. In International Conference on Computer Vision. IEEE, 2000--2009."},{"key":"e_1_3_2_1_19_1","volume-title":"Super SloMo: High Quality Estimation of Multiple Intermediate Frames for Video Interpolation. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 9000--9008","author":"Jiang Huaizu","year":"2018","unstructured":"Huaizu Jiang, Deqing Sun, Varun Jampani, Ming-Hsuan Yang, Erik G. Learned-Miller, and Jan Kautz. 2018. Super SloMo: High Quality Estimation of Multiple Intermediate Frames for Video Interpolation. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 9000--9008."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2938340"},{"key":"e_1_3_2_1_21_1","volume-title":"Prompting Visual-Language Models for Efficient Video Understanding. In European Conference on Computer Vision","volume":"13695","author":"Ju Chen","year":"2022","unstructured":"Chen Ju, Tengda Han, Kunhao Zheng, Ya Zhang, and Weidi Xie. 2022. Prompting Visual-Language Models for Efficient Video Understanding. In European Conference on Computer Vision, Vol. 13695. Springer, 105--124."},{"key":"e_1_3_2_1_22_1","volume-title":"Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, Mustafa Suleyman, and Andrew Zisserman.","author":"Kay Will","year":"2017","unstructured":"Will Kay, Jo a o Carreira, Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, Mustafa Suleyman, and Andrew Zisserman. 2017. The Kinetics Human Action Video Dataset. CoRR, Vol. abs\/1705.06950 (2017), 1--12. showeprint[arXiv]1705.06950"},{"key":"e_1_3_2_1_23_1","volume-title":"Regularization on Spatio-Temporally Smoothed Feature for Action Recognition. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 12100--12109","author":"Kim Jinhyung","year":"2020","unstructured":"Jinhyung Kim, Seunghwan Cha, Dongyoon Wee, Soonmin Bae, and Junmo Kim. 2020. Regularization on Spatio-Temporally Smoothed Feature for Action Recognition. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 12100--12109."},{"key":"e_1_3_2_1_24_1","volume-title":"Spatio-Temporal Transformer Network for Video Restoration. In European Conference on Computer Vision. Springer, 111--127","author":"Kim Tae Hyun","year":"2018","unstructured":"Tae Hyun Kim, Mehdi S. M. Sajjadi, Michael Hirsch, and Bernhard Sch\u00f6lkopf. 2018. Spatio-Temporal Transformer Network for Video Restoration. In European Conference on Computer Vision. Springer, 111--127."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_26_1","volume-title":"Hoi","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. CoRR, Vol. abs\/2301.12597 (2023), 1--11."},{"key":"e_1_3_2_1_27_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022a. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_28_1","volume-title":"AAAI Conference on Artificial Intelligence. AAAI Press, 1404--1411","author":"Fei Mengjuan","year":"2022","unstructured":"Shuyuan Li, Huabin Liu, Rui Qian, Yuxi Li, John See, Mengjuan Fei, Xiaoyuan Yu, and Weiyao Lin. 2022b. TA2N: Two-Stage Action Alignment Network for Few-Shot Action Recognition. In AAAI Conference on Artificial Intelligence. AAAI Press, 1404--1411."},{"key":"e_1_3_2_1_29_1","volume-title":"TEA: Temporal Excitation and Aggregation for Action Recognition. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 906--915","author":"Li Yan","year":"2020","unstructured":"Yan Li, Bin Ji, Xintian Shi, Jianguo Zhang, Bin Kang, and Limin Wang. 2020. TEA: Temporal Excitation and Aggregation for Action Recognition. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 906--915."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01352"},{"key":"e_1_3_2_1_32_1","volume-title":"TSM: Temporal Shift Module for Efficient Video Understanding. In International Conference on Computer Vision. IEEE, 7082--7092","author":"Lin Ji","year":"2019","unstructured":"Ji Lin, Chuang Gan, and Song Han. 2019. TSM: Temporal Shift Module for Efficient Video Understanding. In International Conference on Computer Vision. IEEE, 7082--7092."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3158317"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3556537"},{"key":"e_1_3_2_1_35_1","volume-title":"Video Swin Transformer. In Conference on Computer Vision and Pattern Recognition. IEEE, 3192--3201","author":"Liu Ze","year":"2022","unstructured":"Ze Liu, Jia Ning, Yue Cao, Yixuan Wei, Zheng Zhang, Stephen Lin, and Han Hu. 2022. Video Swin Transformer. In Conference on Computer Vision and Pattern Recognition. IEEE, 3192--3201."},{"key":"e_1_3_2_1_36_1","volume-title":"Restoration of User Videos Shared on Social Media. In ACM International Conference on Multimedia. ACM, 2749--2757","author":"Luo Hongming","year":"2022","unstructured":"Hongming Luo, Fei Zhou, Kin-Man Lam, and Guoping Qiu. 2022. Restoration of User Videos Shared on Social Media. In ACM International Conference on Multimedia. ACM, 2749--2757."},{"key":"e_1_3_2_1_37_1","volume-title":"Search-oriented Micro-video Captioning. In ACM International Conference on Multimedia. ACM, 3234--3243","author":"Nie Liqiang","year":"2022","unstructured":"Liqiang Nie, Leigang Qu, Dai Meng, Min Zhang, Qi Tian, and Alberto Del Bimbo. 2022. Search-oriented Micro-video Captioning. In ACM International Conference on Multimedia. ACM, 3234--3243."},{"key":"e_1_3_2_1_38_1","volume-title":"Advances in Neural Information Processing Systems","author":"Ouyang Long","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Gray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. In Advances in Neural Information Processing Systems. MIT Press, 1--10."},{"key":"e_1_3_2_1_39_1","volume-title":"Dynamic Modality Interaction Modeling for Image-Text Retrieval. In International ACM SIGIR Conference on Research and Development in Information Retrieval. ACM, 1104--1113","author":"Qu Leigang","year":"2021","unstructured":"Leigang Qu, Meng Liu, Jianlong Wu, Zan Gao, and Liqiang Nie. 2021. Dynamic Modality Interaction Modeling for Image-Text Retrieval. In International ACM SIGIR Conference on Research and Development in Information Retrieval. ACM, 1104--1113."},{"key":"e_1_3_2_1_40_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_41_1","volume-title":"Advances in Neural Information Processing Systems","author":"Rana Aayush","unstructured":"Aayush Rana and Yogesh S Rawat. 2022. Are all Frames Equal? Active Sparse Labeling for Video Action Detection. In Advances in Neural Information Processing Systems. MIT Press, 1--15."},{"key":"e_1_3_2_1_42_1","volume-title":"Two-Stream Convolutional Networks for Action Recognition in Videos. In Annual Conference on Neural Information Processing Systems. MIT Press, 568--576","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-Stream Convolutional Networks for Action Recognition in Videos. In Annual Conference on Neural Information Processing Systems. MIT Press, 568--576."},{"key":"e_1_3_2_1_43_1","volume-title":"Haoqi Fan, Vaibhav Aggarwal, Aaron Adcock, Armand Joulin, Piotr Doll\u00e1r, Christoph Feichtenhofer, Ross Girshick, Rohit Girdhar, and Ishan Misra.","author":"Singh Mannat","year":"2023","unstructured":"Mannat Singh, Quentin Duval, Kalyan Vasudev Alwala, Haoqi Fan, Vaibhav Aggarwal, Aaron Adcock, Armand Joulin, Piotr Doll\u00e1r, Christoph Feichtenhofer, Ross Girshick, Rohit Girdhar, and Ishan Misra. 2023. The effectiveness of MAE pre-pretraining for billion-scale pretraining. CoRR, Vol. abs\/2303.13496 (2023), 1--11."},{"key":"e_1_3_2_1_44_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. CoRR, Vol. abs\/1212.0402 (2012), 1--7. [arXiv]1212.0402"},{"key":"e_1_3_2_1_45_1","volume-title":"Selective Dependency Aggregation for Action Classification. In ACM Multimedia Conference. ACM, 592--601","author":"Tan Yi","year":"2021","unstructured":"Yi Tan, Yanbin Hao, Xiangnan He, Yinwei Wei, and Xun Yang. 2021. Selective Dependency Aggregation for Action Classification. In ACM Multimedia Conference. ACM, 592--601."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01163"},{"key":"e_1_3_2_1_49_1","volume-title":"TDN: Temporal Difference Networks for Efficient Action Recognition. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE","author":"Wang Limin","year":"2021","unstructured":"Limin Wang, Zhan Tong, Bin Ji, and Gangshan Wu. 2021b. TDN: Temporal Difference Networks for Efficient Action Recognition. In Conference on Computer Vision and Pattern Recognition. CVF \/ IEEE, 1895--1904."},{"key":"e_1_3_2_1_50_1","volume-title":"Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. In European Conference on Computer Vision. Springer, 20--36","author":"Wang Limin","year":"2016","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2016. Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. In European Conference on Computer Vision. Springer, 20--36."},{"key":"e_1_3_2_1_51_1","volume-title":"ActionCLIP: A New Paradigm for Video Action Recognition. CoRR","author":"Wang Mengmeng","year":"2021","unstructured":"Mengmeng Wang, Jiazheng Xing, and Yong Liu. 2021c. ActionCLIP: A New Paradigm for Video Action Recognition. CoRR, Vol. abs\/2109.08472 (2021), 1--11."},{"key":"e_1_3_2_1_52_1","volume-title":"Perception-Aware Cross-Modal Signal Reconstruction: From Audio-Haptic to Visual","author":"Wei Xin","year":"2022","unstructured":"Xin Wei, Yuyuan Yao, Haoyu Wang, and Liang Zhou. 2022. Perception-Aware Cross-Modal Signal Reconstruction: From Audio-Haptic to Visual. IEEE Transactions on Multimedia (2022), 1--12."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00023"},{"key":"e_1_3_2_1_54_1","volume-title":"Revisiting Classifier: Transferring Vision-Language Models for Video Recognition. In AAAI Conference on Artificial Intelligence. AAAI Press, 1--10","author":"Wu Wenhao","year":"2023","unstructured":"Wenhao Wu, Zhun Sun, and Wanli Ouyang. 2023. Revisiting Classifier: Transferring Vision-Language Models for Video Recognition. In AAAI Conference on Artificial Intelligence. AAAI Press, 1--10."},{"key":"e_1_3_2_1_55_1","volume-title":"Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification. In European Conference on Computer Vision. Springer, 318--335","author":"Xie Saining","year":"2018","unstructured":"Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, and Kevin Murphy. 2018. Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification. In European Conference on Computer Vision. Springer, 318--335."},{"key":"e_1_3_2_1_56_1","volume-title":"MGSampler: An Explainable Sampling Strategy for Video Action Recognition. In International Conference on Computer Vision. IEEE, 1493--1502","author":"Zhi Yuan","year":"2021","unstructured":"Yuan Zhi, Zhan Tong, Limin Wang, and Gangshan Wu. 2021. MGSampler: An Explainable Sampling Strategy for Video Action Recognition. In International Conference on Computer Vision. IEEE, 1493--1502."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Chengcheng Zhou Zongqing Lu Linge Li Qiangyu Yan and Jing-Hao Xue. 2021. How Video Super-Resolution and Frame Interpolation Mutually Benefit. In ACM Multimedia. ACM 5445--5453.","DOI":"10.1145\/3474085.3475672"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611696","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611696","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:58:37Z","timestamp":1755820717000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611696"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":57,"alternative-id":["10.1145\/3581783.3611696","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611696","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}