{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:15:42Z","timestamp":1775578542839,"version":"3.50.1"},"reference-count":84,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22A2095"],"award-info":[{"award-number":["U22A2095"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276281"],"award-info":[{"award-number":["62276281"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001321","name":"Guangdong Basic and Applied Basic Research Foundation","doi-asserted-by":"publisher","award":["2024A1515011882"],"award-info":[{"award-number":["2024A1515011882"]}],"id":[{"id":"10.13039\/501100001321","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. 
Video Technol."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tcsvt.2025.3558785","type":"journal-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T13:57:07Z","timestamp":1744207027000},"page":"9246-9260","source":"Crossref","is-referenced-by-count":2,"title":["Vision-Language Adaptive Clustering and Meta-Adaptation for Unsupervised Few-Shot Action Recognition"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1448-8172","authenticated-orcid":false,"given":"Jiaxin","family":"Chen","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2299-500X","authenticated-orcid":false,"given":"Jiawen","family":"Peng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5554-8706","authenticated-orcid":false,"given":"Yanzuo","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3883-2024","authenticated-orcid":false,"given":"Jian-Huang","family":"Lai","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0165-8416","authenticated-orcid":false,"given":"Andy J.","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Sun Yat-sen University, Guangzhou, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3384875"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3262670"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3175923"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/181"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00837"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407813"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00992"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICEIEC49280.2020.9152261"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"ref14","first-page":"10078","article-title":"VideoMAE: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tong"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00267"},{"key":"ref18","first-page":"1","article-title":"Unsupervised learning via meta-learning","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Hsu"},{"key":"ref19","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 
38th Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref20","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. 36th Int. Conf. Mach. Learn."},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00300"},{"key":"ref22","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Finn"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01091"},{"key":"ref24","first-page":"3637","article-title":"Matching networks for one shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Vinyals"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00131"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00491"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107951"},{"key":"ref28","first-page":"1","article-title":"Meta-GMVAE: Mixture of Gaussian VAE for unsupervised meta-learning","volume-title":"Proc. ICLR","author":"Lee"},{"key":"ref29","first-page":"10132","article-title":"Unsupervised meta-learning for few-shot image classification","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Khodadadeh"},{"key":"ref30","article-title":"Self-supervised prototypical transfer learning for few-shot classification","author":"Medina","year":"2020","journal-title":"arXiv:2006.11325"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3179368"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01063"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00054"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00167"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20029"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00894"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_18"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01933"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01932"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413502"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00628"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.112539"},{"key":"ref43","article-title":"Self-supervised spatiotemporal feature learning via video rotation prediction","author":"Jing","year":"2019","journal-title":"arXiv:1811.11387"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"ref46","first-page":"1","article-title":"Self-supervised co-training for video representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Han"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3284977"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110110"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"ref50","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. 38th Int. Conf. Mach. 
Learn.","author":"Jia"},{"key":"ref51","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref52","first-page":"1","article-title":"CoCa: Contrastive captioners are image-text foundation models","volume-title":"Proc. Trans. Mach. Learn. Res.","author":"Yu"},{"key":"ref53","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Alayrac"},{"key":"ref54","first-page":"1","article-title":"GIT: A generative image-to-text transformer for vision and language","volume-title":"Proc. Trans. Mach. Learn. Res.","author":"Wang"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3424566"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref58","first-page":"1","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. 9th Int. Conf. Learn. Represent.","author":"Dosovitskiy"},{"key":"ref59","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inform. Process. Syst. (NIPS)","author":"Vaswani"},{"issue":"8","key":"ref60","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref61","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref62","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Li"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/BF02289263"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_31"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1212.0402"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_46"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25403"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01727"},{"key":"ref72","article-title":"Few-shot action recognition with captioning foundation models","author":"Wang","year":"2023","journal-title":"arXiv:2310.10125"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01917-4"},{"key":"ref74","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.128649","article-title":"Consistency prototype module and motion compensation for few-shot action recognition (CLIP-CPM2C)","volume":"611","author":"Guo","year":"2025","journal-title":"Neurocomputing"},{"key":"ref75","article-title":"MVP-shot: Multi-velocity progressive-alignment framework for few-shot action recognition","author":"Qu","year":"2024","journal-title":"arXiv:2405.02077"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111902"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_19"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"ref79","first-page":"1","article-title":"AIM: Adapting image models for efficient video action recognition","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Yang"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72646-0_3"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01450"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/76\/11154820\/10960322.pdf?arnumber=10960322","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T17:49:14Z","timestamp":1757526554000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10960322\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":84,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2025.3558785","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}