{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:16Z","timestamp":1750309516179,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100004731","name":"Natural Science Foundation of Zhejiang Province","doi-asserted-by":"publisher","award":["LQ22F020007,LD24F020005,LDT23F0202,LDT23F02021F02"],"award-info":[{"award-number":["LQ22F020007,LD24F020005,LDT23F0202,LDT23F02021F02"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100004731","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206250,62036009,62276237,62001418"],"award-info":[{"award-number":["62206250,62036009,62276237,62001418"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Ten Thousand Talent Program of Zhejiang Province"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681200","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"1024-1033","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Saliency-Guided Fine-Grained Temporal Mask Learning for Few-Shot Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0597-1060","authenticated-orcid":false,"given":"Shuo","family":"Zheng","sequence":"first","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8302-1338","authenticated-orcid":false,"given":"Yuanjie","family":"Dang","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6122-0574","authenticated-orcid":false,"given":"Peng","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2555-343X","authenticated-orcid":false,"given":"Ruohong","family":"Huan","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6032-5083","authenticated-orcid":false,"given":"Dongdong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2077-9608","authenticated-orcid":false,"given":"Ronghua","family":"Liang","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2022. Beit: Bert pre-training of image transformers. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00313"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01063"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning (ICML). PMLR, 1691--1703","author":"Chen Mark","year":"2020","unstructured":"Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, and Ilya Sutskever. 2020. Generative pretraining from pixels. In International Conference on Machine Learning (ICML). PMLR, 1691--1703."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","first-page":"21981","article-title":"Crosstransformers: spatially-aware few-shot transfer","volume":"33","author":"Doersch Carl","year":"2020","unstructured":"Carl Doersch, Ankush Gupta, and Andrew Zisserman. 2020. Crosstransformers: spatially-aware few-shot transfer. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 33. 21981--21993.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_13_1","first-page":"35946","article-title":"Masked autoencoders as spatiotemporal learners","volume":"35","author":"Feichtenhofer Christoph","year":"2022","unstructured":"Christoph Feichtenhofer, Yanghao Li, Kaiming He, et al. 2022. Masked autoencoders as spatiotemporal learners. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 35. 35946--35958.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_14_1","volume-title":"Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks. In International Conference on Machine Learning (ICML). 1126--1135","author":"Finn Chelsea","year":"2017","unstructured":"Chelsea Finn, Pieter Abbeel, and Sergey Levine. 2017. Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks. In International Conference on Machine Learning (ICML). 1126--1135."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413502"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01241"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","volume":"36","author":"Fei Mengjuan","year":"2022","unstructured":"Shuyuan Li, Huabin Liu, Rui Qian, Yuxi Li, John See, Mengjuan Fei, Xiaoyuan Yu, and Weiyao Lin. 2022. TA2N: Two-stage action alignment network for few-shot action recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), Vol. 36. 1404--1411."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the ACM International Conference on Multimedia (MM). 6230--6240","author":"Lin Weiyao","year":"2022","unstructured":"Huabin Liu, Weixian Lv, John See, and Weiyao Lin. 2022. Task-adaptive Spatial-Temporal Video Sampler for Few-shot Action Recognition. In Proceedings of the ACM International Conference on Multimedia (MM). 6230--6240."},{"key":"e_1_3_2_1_23_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 32."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00054"},{"key":"e_1_3_2_1_25_1","volume-title":"Mar: Masked autoencoders for efficient action recognition","author":"Qing Zhiwu","year":"2023","unstructured":"Zhiwu Qing, Shiwei Zhang, Ziyuan Huang, Xiang Wang, Yuehuan Wang, Yiliang Lv, Changxin Gao, and Nong Sang. 2023. Mar: Masked autoencoders for efficient action recognition. IEEE Transactions on Multimedia (TMM) (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning (ICML).","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_27_1","unstructured":"Jake Snell Kevin Swersky and Richard Zemel. 2017. Prototypical networks for few-shot learning. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_28_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1199--1208","author":"Sung Flood","key":"e_1_3_2_1_29_1","unstructured":"Flood Sung, Yongxin Yang, Li Zhang, Tao Xiang, Philip H. S. Torr, and Timothy M. Hospedales. 2018. Learning to compare: Relation network for few-shot learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1199--1208."},{"key":"e_1_3_2_1_30_1","volume-title":"Vimpac: Video pre-training via masked token prediction and contrastive learning. arXiv preprint arXiv:2106.11250","author":"Tan Hao","year":"2021","unstructured":"Hao Tan, Jie Lei, Thomas Wolf, and Mohit Bansal. 2021. Vimpac: Video pre-training via masked token prediction and contrastive learning. arXiv preprint arXiv:2106.11250 (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01933"},{"key":"e_1_3_2_1_32_1","first-page":"10078","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 35. 10078--10093.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_33_1","first-page":"3637","article-title":"Matching networks for one shot learning","volume":"29","author":"Vinyals Oriol","year":"2016","unstructured":"Oriol Vinyals, Charles Blundell, Timothy Lillicrap, Koray Kavukcuoglu, and Daan Wierstra. 2016. Matching networks for one shot learning. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 29. 3637--3645.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01917-4"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01727"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01932"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00628"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25403"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00167"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_31"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/181"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_18"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_46"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681200","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681200"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":48,"alternative-id":["10.1145\/3664647.3681200","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681200","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}