{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:18:48Z","timestamp":1777655928016,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"The Shenzhen Science and Technology Program","award":["JSGG20220831093004008"],"award-info":[{"award-number":["JSGG20220831093004008"]}]},{"name":"The National Natural Science Foundation of China","award":["Grant No. U1903213"],"award-info":[{"award-number":["Grant No. U1903213"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658044","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Multimodal Prototype-Enhanced Network for Few-Shot Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7845-7762","authenticated-orcid":false,"given":"Xinzhe","family":"Ni","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3078-1598","authenticated-orcid":false,"given":"Yong","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3165-3859","authenticated-orcid":false,"given":"Hao","family":"Wen","sequence":"additional","affiliation":[{"name":"Tsinghua University, ShenZhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8096-2928","authenticated-orcid":false,"given":"Yatai","family":"Ji","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9615-4749","authenticated-orcid":false,"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[{"name":"Ping An Insurance (Group) Company of China, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6427-1024","authenticated-orcid":false,"given":"Yujiu","family":"Yang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Andrychowicz Marcin","unstructured":"Marcin Andrychowicz, Misha Denil, Sergio Gomez, Matthew W Hoffman, David Pfau, Tom Schaul, Brendan Shillingford, and Nando De Freitas. 2016. Learning to learn by gradient descent by gradient descent. In Advances in neural information processing systems (NIPS), Vol. 29."},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Antoniou Antreas","year":"2018","unstructured":"Antreas Antoniou, Harrison Edwards, and Amos Storkey. 2018. How to train your MAML. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_3_1","volume-title":"The British Machine Vision Conference (BMVC). 154","author":"Bishay Mina","year":"2019","unstructured":"Mina Bishay, Georgios Zoumpourlis, and Ioannis Patras. 2019. Tarn: Temporal attentive relation network for few-shot and zero-shot action recognition. In The British Machine Vision Conference (BMVC). 154."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01063"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","volume-title":"Semantic feature augmentation in few-shot learning. arXiv preprint arXiv:1804.05299","author":"Chen Zitian","year":"2018","unstructured":"Zitian Chen, Yanwei Fu, Yinda Zhang, Yu-Gang Jiang, Xiangyang Xue, and Leonid Sigal. 2018. Semantic feature augmentation in few-shot learning. arXiv preprint arXiv:1804.05299, Vol. 86, 89 (2018), 2."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2910052"},{"key":"e_1_3_2_1_8_1","volume-title":"Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (NAACL-HLT)","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (NAACL-HLT), Vol. 1. 4171--4186."},{"key":"e_1_3_2_1_9_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Doersch Carl","year":"1981","unstructured":"Carl Doersch, Ankush Gupta, and Andrew Zisserman. 2020. Crosstransformers: spatially-aware few-shot transfer. In Advances in neural information processing systems (NIPS), Vol. 33. 21981--21993."},{"key":"e_1_3_2_1_10_1","volume-title":"International conference on machine learning (ICML). 1126--1135","author":"Finn Chelsea","year":"2017","unstructured":"Chelsea Finn, Pieter Abbeel, and Sergey Levine. 2017. Model-agnostic meta-learning for fast adaptation of deep networks. In International conference on machine learning (ICML). 1126--1135."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413502"},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Garcia Victor","year":"2018","unstructured":"Victor Garcia and Joan Bruna. 2018. Few-shot learning with graph neural networks. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_27"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Hou Ruibing","unstructured":"Ruibing Hou, Hong Chang, Bingpeng Ma, Shiguang Shan, and Xilin Chen. 2019. Cross attention network for few-shot classification. In Advances in neural information processing systems (NIPS), Vol. 32."},{"key":"e_1_3_2_1_18_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Huang Yu","unstructured":"Yu Huang, Chenzhuang Du, Zihui Xue, Xuanyao Chen, Hang Zhao, and Longbo Huang. 2021. What makes multi-modal learning better than single (provably). In Advances in neural information processing systems (NIPS), Vol. 34. 10944--10956."},{"key":"e_1_3_2_1_19_1","volume-title":"Compound Prototype Matching for Few-shot Action Recognition. In European Conference on Computer Vision (ECCV).","author":"Huang Yifei","year":"2022","unstructured":"Yifei Huang, Lijin Yang, and Yoichi Sato. 2022. Compound Prototype Matching for Few-shot Action Recognition. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01199"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00010"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00166"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01091"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00009"},{"key":"e_1_3_2_1_26_1","volume-title":"MASTAF: A Spatio-Temporal Attention Fusion Network for Few-shot Video Classification. arXiv preprint arXiv:2112.04585","author":"Liu Rex","year":"2021","unstructured":"Rex Liu, Huanle Zhang, Hamed Pirsiavash, and Xin Liu. 2021. MASTAF: A Spatio-Temporal Attention Fusion Network for Few-shot Video Classification. arXiv preprint arXiv:2112.04585 (2021)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_24"},{"key":"e_1_3_2_1_28_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_29_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00029"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00269"},{"key":"e_1_3_2_1_32_1","volume-title":"The effectiveness of data augmentation in image classification using deep learning. arXiv preprint arXiv:1712.04621","author":"Perez Luis","year":"2017","unstructured":"Luis Perez and Jason Wang. 2017. The effectiveness of data augmentation in image classification using deep learning. arXiv preprint arXiv:1712.04621 (2017)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00054"},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning (ICML). 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning (ICML). 8748--8763."},{"key":"e_1_3_2_1_35_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_1_37_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Ratner Alexander J","unstructured":"Alexander J Ratner, Henry Ehrenberg, Zeshan Hussain, Jared Dunnmon, and Christopher R\u00e9. 2017. Learning to compose domain-specific transformations for data augmentation. In Advances in neural information processing systems (NIPS), Vol. 30."},{"key":"e_1_3_2_1_38_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Ravi Sachin","year":"2017","unstructured":"Sachin Ravi and Hugo Larochelle. 2017. Optimization as a model for few-shot learning. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_39_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Snell Jake","unstructured":"Jake Snell, Kevin Swersky, and Richard Zemel. 2017. Prototypical networks for few-shot learning. In Advances in neural information processing systems (NIPS), Vol. 30."},{"key":"e_1_3_2_1_40_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00049"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00131"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01933"},{"key":"e_1_3_2_1_44_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Tsimpoukelli Maria","unstructured":"Maria Tsimpoukelli, Jacob L Menick, Serkan Cabi, SM Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal few-shot learning with frozen language models. In Advances in neural information processing systems (NIPS), Vol. 34. 200--212."},{"key":"e_1_3_2_1_45_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_46_1","volume-title":"Advances in neural information processing systems (NIPS)","author":"Vaswani Ashish","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems (NIPS), Vol. 30."},{"key":"e_1_3_2_1_47_1","unstructured":"Oriol Vinyals Charles Blundell Timothy Lillicrap Daan Wierstra et al. 2016. Matching networks for one shot learning. In Advances in neural information processing systems (NIPS) Vol. 29."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_1_49_1","volume-title":"Actionclip: A new paradigm for video action recognition. arXiv preprint arXiv:2109.08472","author":"Wang Mengmeng","year":"2021","unstructured":"Mengmeng Wang, Jiazheng Xing, and Yong Liu. 2021. Actionclip: A new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)."},{"key":"e_1_3_2_1_50_1","volume-title":"2023 a. CLIP-guided prototype modulating for few-shot action recognition. arXiv preprint arXiv:2303.02982","author":"Wang Xiang","year":"2023","unstructured":"Xiang Wang, Shiwei Zhang, Jun Cen, Changxin Gao, Yingya Zhang, Deli Zhao, and Nong Sang. 2023 a. CLIP-guided prototype modulating for few-shot action recognition. arXiv preprint arXiv:2303.02982 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01727"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01932"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00894"},{"key":"e_1_3_2_1_54_1","volume-title":"2023 a. Multimodal Adaptation of CLIP for Few-Shot Action Recognition. arXiv preprint arXiv:2308.01532","author":"Xing Jiazheng","year":"2023","unstructured":"Jiazheng Xing, Mengmeng Wang, Xiaojun Hou, Guang Dai, Jingdong Wang, and Yong Liu. 2023 a. Multimodal Adaptation of CLIP for Few-Shot Action Recognition. arXiv preprint arXiv:2308.01532 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00167"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01340"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00883"},{"key":"e_1_3_2_1_58_1","volume-title":"International conference on machine learning (ICML). 7115--7123","author":"Yoon Sung Whan","year":"2019","unstructured":"Sung Whan Yoon, Jun Seo, and Jaekyun Moon. 2019. Tapnet: Neural network augmented with task-adaptive projection for few-shot learning. In International conference on machine learning (ICML). 7115--7123."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00375"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_31"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107348"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/181"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_18"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_46"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.94"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658044","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658044","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:52:17Z","timestamp":1755766337000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658044"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":65,"alternative-id":["10.1145\/3652583.3658044","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658044","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}