{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T03:56:50Z","timestamp":1781150210799,"version":"3.54.1"},"reference-count":85,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100009021","name":"Xi'an University of Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100009021","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.neunet.2026.108930","type":"journal-article","created":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T06:59:42Z","timestamp":1775026782000},"page":"108930","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Multimodal-guided prototype calibration and temporal coherence-aware hybrid matching for few-shot action recognition"],"prefix":"10.1016","volume":"201","author":[{"given":"Yiyuan","family":"An","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yingmin","family":"Yi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yiwei","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qiming","family":"Xue","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.108930_bib0004","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"3330","article-title":"Star-transformer: A spatio-temporal cross attention transformer for human action recognition","author":"Ahn","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0016","unstructured":"M. Bishay, G. Zoumpourlis, I. Patras, Tarn: Temporal attentive relation network for few-shot and zero-shot action recognition, 2019, arXiv:1907.09021."},{"key":"10.1016\/j.neunet.2026.108930_bib0006","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","first-page":"3420","article-title":"Pitchernet: Powering the moneyball evolution in baseball video analytics","author":"Bright","year":"2024"},{"key":"10.1016\/j.neunet.2026.108930_bib0011","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10618","article-title":"Few-shot video classification via temporal alignment","author":"Cao","year":"2020"},{"key":"10.1016\/j.neunet.2026.108930_bib0046","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"105","article-title":"Prompting visual-language models for efficient video understanding","author":"Chen","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0075","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.128246","article-title":"MSVT: Multi-grained spatial and vmamba temporal for few-shot action recognition","volume":"288","author":"Chen","year":"2025","journal-title":"Expert Syst. Appl."},{"issue":"13","key":"10.1016\/j.neunet.2026.108930_bib0033","doi-asserted-by":"crossref","first-page":"5195","DOI":"10.1109\/JSEN.2019.2903645","article-title":"A robust framework for abnormal human action recognition using R-Transform and zernike moments in depth videos","volume":"19","author":"Dhiman","year":"2019","journal-title":"IEEE Sens. J."},{"key":"10.1016\/j.neunet.2026.108930_bib0035","doi-asserted-by":"crossref","first-page":"3835","DOI":"10.1109\/TIP.2020.2965299","article-title":"View-invariant deep architecture for human action recognition using two-stream motion and shape temporal dynamics","volume":"29","author":"Dhiman","year":"2020","journal-title":"IEEE Trans. Image Process."},{"issue":"3","key":"10.1016\/j.neunet.2026.108930_bib0036","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3441628","article-title":"Part-wise spatio-temporal attention driven CNN-based 3D human action recognition","volume":"17","author":"Dhiman","year":"2021","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"10.1016\/j.neunet.2026.108930_bib0012","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops","first-page":"1308","article-title":"Protogan: Towards few-shot learning for action recognition","author":"Dwivedi","year":"2019"},{"key":"10.1016\/j.neunet.2026.108930_bib0037","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"200","article-title":"X3D: Expanding architectures for efficient video recognition","author":"Feichtenhofer","year":"2020"},{"key":"10.1016\/j.neunet.2026.108930_bib0015","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"1142","article-title":"Depth guided adaptive meta-fusion network for few-shot video recognition","author":"Fu","year":"2020"},{"key":"10.1016\/j.neunet.2026.108930_bib0074","doi-asserted-by":"crossref","first-page":"2450","DOI":"10.1109\/TMM.2024.3521712","article-title":"Hierarchical motion-enhanced matching framework for few-shot action recognition","volume":"27","author":"Gao","year":"2025","journal-title":"IEEE Trans. Multimedia."},{"key":"10.1016\/j.neunet.2026.108930_bib0081","series-title":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","first-page":"1026","article-title":"Reliable and diverse hierarchical adapter for zero-shot video classification","author":"Ge","year":"2025"},{"key":"10.1016\/j.neunet.2026.108930_bib0062","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"5842","article-title":"The \u201csomething something\u201d video database for learning and evaluating visual common sense","author":"Goyal","year":"2017"},{"key":"10.1016\/j.neunet.2026.108930_bib0077","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112539","article-title":"Multi-view distillation based on multi-modal fusion for few-shot action recognition (CLIP-MDMF)","volume":"304","author":"Guo","year":"2024","journal-title":"Knowl. - Based Syst."},{"key":"10.1016\/j.neunet.2026.108930_bib0076","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.128649","article-title":"Consistency prototype module and motion compensation for few-shot action recognition (CLIP-CPM2C)","volume":"611","author":"Guo","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neunet.2026.108930_bib0080","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.126411","article-title":"DMSD-CDFSAR: Distillation from mixed-source domain for cross-domain few-shot action recognition","volume":"270","author":"Guo","year":"2025","journal-title":"Expert Syst. Appl."},{"issue":"7","key":"10.1016\/j.neunet.2026.108930_bib0059","doi-asserted-by":"crossref","first-page":"2408","DOI":"10.1109\/TMM.2023.3295731","article-title":"Textual enhanced adaptive meta-fusion for few-shot visual recognition","volume":"26","author":"Han","year":"2024","journal-title":"IEEE Trans. Multimedia."},{"key":"10.1016\/j.neunet.2026.108930_bib0055","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"351","article-title":"Compound prototype matching for few-shot action recognition","author":"Huang","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0042","series-title":"Proceedings of the International conference on machine learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0052","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"2000","article-title":"STM: Spatiotemporal and motion encoding for action recognition","author":"Jiang","year":"2019"},{"key":"10.1016\/j.neunet.2026.108930_bib0073","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.129821","article-title":"Multi-temporal ensemble for few-shot action recognition","volume":"298","author":"Jiang","year":"2026","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.neunet.2026.108930_bib0061","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2556","article-title":"HMDB: A large video database for human motion recognition","author":"Kuehne","year":"2011"},{"key":"10.1016\/j.neunet.2026.108930_bib0021","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"909","article-title":"TEA: Temporal excitation and aggregation for action recognition","author":"Li","year":"2020"},{"key":"10.1016\/j.neunet.2026.108930_bib0068","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1404","article-title":"TA2N: Two-stage action alignment network for few-shot action recognition","author":"Li","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0070","unstructured":"S. Li, H. Liu, R. Qian, Ttan: Two-stage temporal alignment network for few-shot action recognition, 2021, arXiv: 2107.04782."},{"issue":"12","key":"10.1016\/j.neunet.2026.108930_bib0010","doi-asserted-by":"crossref","first-page":"14938","DOI":"10.1109\/TPAMI.2023.3312125","article-title":"Libfewshot: A comprehensive library for few-shot learning","volume":"45","author":"Li","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach Intell."},{"key":"10.1016\/j.neunet.2026.108930_bib0022","article-title":"Spatio-temporal adaptive convolution and bidirectional motion difference fusion for video action recognition","author":"Li","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.neunet.2026.108930_bib0009","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.122173","article-title":"SSL-ProtoNet: Self-supervised learning prototypical networks for few-shot learning","volume":"238","author":"Lim","year":"2024","journal-title":"Expert Syst. Appl."},{"issue":"8","key":"10.1016\/j.neunet.2026.108930_bib0003","doi-asserted-by":"crossref","first-page":"4137","DOI":"10.1109\/TCSVT.2023.3240472","article-title":"Transkeleton: Hierarchical spatial\u2013temporal transformer for skeleton-based action recognition","volume":"33","author":"Liu","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.neunet.2026.108930_bib0071","unstructured":"H. Liu, W. Lin, T. Chen, Y. Li, S. Li, J. See, et al., Few-shot action recognition via intra- and inter-video information maximization, 2023, arXiv:2305.06114."},{"key":"10.1016\/j.neunet.2026.108930_bib0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3202","article-title":"Video Swin transformer","author":"Liu","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.111686","article-title":"Mixed resolution network with hierarchical motion modeling for efficient action recognition","volume":"294","author":"Lu","year":"2024","journal-title":"Knowl. - Based Syst."},{"key":"10.1016\/j.neunet.2026.108930_bib0056","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"471","article-title":"Inductive and transductive few-shot video classification via appearance and temporal alignments","author":"Nguyen","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0047","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"1","article-title":"Expanding language-image pretrained models for general video recognition","author":"Ni","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0027","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"475","article-title":"Temporal-relational cross-transformers for few-shot action recognition","author":"Perrett","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0030","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"9945","article-title":"Representation flow for action recognition","author":"Piergiovanni","year":"2019"},{"key":"10.1016\/j.neunet.2026.108930_bib0038","series-title":"Proceedings of the 28th ACM international conference on multimedia","first-page":"3007","article-title":"Few-shot ensemble learning for video classification with slowfast memory networks","author":"Qi","year":"2020"},{"key":"10.1016\/j.neunet.2026.108930_bib0001","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106326","article-title":"TFRS: A task-level feature rectification and separation method for few-shot video action recognition","volume":"176","author":"Qin","year":"2024","journal-title":"Neural Netw"},{"key":"10.1016\/j.neunet.2026.108930_bib0031","series-title":"Proceedings of the International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0049","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6545","article-title":"Fine-tuned CLIP models are efficient video learners","author":"Rasheed","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0050","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","volume":"27","author":"Simonyan","year":"2014","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108930_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110648","article-title":"Prompt-guided DETR with RoI-pruned masked attention for open-vocabulary object detection","volume":"155","author":"Song","year":"2024","journal-title":"Pattern Recognit"},{"key":"10.1016\/j.neunet.2026.108930_bib0060","unstructured":"K. Soomro, A.R. Zamir, M. Shah, UCF101: A dataset of 101 human action classes from videos in the wild, 2012, arXiv:1212.0402."},{"key":"10.1016\/j.neunet.2026.108930_bib0017","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19958","article-title":"Spatio-temporal relation modeling for few-shot action recognition","author":"Thatipelli","year":"2022"},{"issue":"11","key":"10.1016\/j.neunet.2026.108930_bib0032","doi-asserted-by":"crossref","first-page":"1595","DOI":"10.1007\/s00371-018-1560-4","article-title":"A unified model for human activity recognition using spatial distribution of gradients and difference of Gaussian kernel","volume":"35","author":"Vishwakarma","year":"2019","journal-title":"The Visual Computer"},{"key":"10.1016\/j.neunet.2026.108930_bib0051","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"Wang","year":"2016"},{"key":"10.1016\/j.neunet.2026.108930_bib0020","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"13214","article-title":"Action-Net: Multipath excitation for action recognition","author":"Wang","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0034","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"13209","article-title":"ACTION-Net: multipath excitation for action recognition","author":"Wang","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0039","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1895","article-title":"TDN: temporal difference networks for efficient action recognition","author":"Wang","year":"2021"},{"key":"10.1016\/j.neunet.2026.108930_bib0072","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"816","article-title":"Semantic-guided relation propagation network for few-shot action recognition","author":"Wang","year":"2021"},{"issue":"3","key":"10.1016\/j.neunet.2026.108930_bib0023","first-page":"3347","article-title":"Learning spatiotemporal and motion features in a unified 2D network for action recognition","volume":"45","author":"Wang","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach Intell."},{"key":"10.1016\/j.neunet.2026.108930_bib0018","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"19916","article-title":"Hybrid relation guided set matching for few-shot action recognition","author":"Wang","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0063","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"19916","article-title":"Hybrid relation guided set matching for few-shot action recognition","author":"Wang","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0029","unstructured":"X. Wang, H. Zhang, S. Zhang, C. Gao, Y. Shao, N. Sang, Context-aware proposal network for temporal action detection, 2022, arXiv: 2206.09082."},{"issue":"18","key":"10.1016\/j.neunet.2026.108930_bib0007","doi-asserted-by":"crossref","first-page":"15857","DOI":"10.1109\/JIOT.2023.3266247","article-title":"Data fusion in infrastructure-augmented autonomous driving system: why? where? and how?","volume":"10","author":"Wang","year":"2023","journal-title":"IEEE Internet Things J"},{"issue":"12","key":"10.1016\/j.neunet.2026.108930_bib0008","doi-asserted-by":"crossref","first-page":"7789","DOI":"10.1109\/TCSVT.2023.3282777","article-title":"Few-shot learning meets transformer: Unified query-support transformers for few-shot classification","volume":"33","author":"Wang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.neunet.2026.108930_bib0026","article-title":"Actionclip: Adapting language-image pretrained models for video action recognition","author":"Wang","year":"2023","journal-title":"IEEE Trans. Neural Netw. Learn Syst"},{"issue":"10","key":"10.1016\/j.neunet.2026.108930_bib0064","doi-asserted-by":"crossref","first-page":"5932","DOI":"10.1109\/TCSVT.2023.3262670","article-title":"Task-aware dual-representation network for few shot action recognition","volume":"33","author":"Wang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.neunet.2026.108930_bib0079","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2023.103737","article-title":"Cross-domain few-shot action recognition with unlabeled videos","volume":"233","author":"Wang","year":"2023","journal-title":"Computer Vision and Image Understanding"},{"key":"10.1016\/j.neunet.2026.108930_bib0028","series-title":"Proceedings of the 31st ACM International Conference on Multimedia","first-page":"5339","article-title":"Seeing in flowing: Adapting clip for action recognition with motion prompts learning","author":"Wang","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0057","doi-asserted-by":"crossref","first-page":"1257","DOI":"10.1109\/TIP.2024.3354104","article-title":"Cross-modal contrastive learning network for few-shot action recognition","volume":"33","author":"Wang","year":"2024","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.neunet.2026.108930_bib0067","doi-asserted-by":"crossref","first-page":"1899","DOI":"10.1007\/s11263-023-01917-4","article-title":"CLIP-guided prototype modulating for few-shot action recognition","volume":"132","author":"Wang","year":"2024","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neunet.2026.108930_bib0082","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"23034","article-title":"Vita-clip: Video and text adaptive clip via multimodal prompting","author":"Wasim","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0005","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"21243","article-title":"Hierarchical temporal transformer for 3D hand pose estimation and action recognition from egocentric rgb videos","author":"Wen","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0014","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9151","article-title":"Motion-modulated temporal fragment alignment network for few-shot action recognition","author":"Wu","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0048","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"2847","article-title":"Revisiting classifier: Transferring vision-language models for video recognition","author":"Wu","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0083","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"9384","article-title":"Attentive region embedding network for zero-shot learning","author":"Xie","year":"2019"},{"key":"10.1016\/j.neunet.2026.108930_bib0084","doi-asserted-by":"crossref","DOI":"10.1109\/TNNLS.2025.3598191","article-title":"Attribute prompt alignment network for zero-shot learning","author":"Xie","year":"2025","journal-title":"IEEE Trans. Neural Netw. Learn Syst"},{"key":"10.1016\/j.neunet.2026.108930_bib0053","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3001","article-title":"Revisiting the spatial and temporal modeling for few-shot action recognition","author":"Xing","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0066","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1740","article-title":"Boosting few-shot action recognition with graph-guided hybrid matching","author":"Xing","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0078","article-title":"MA-FSAR: Multimodal adaptation of CLIP for few-shot action recognition","volume":"169","author":"Xing","year":"2025","journal-title":"Pattern Recogn"},{"key":"10.1016\/j.neunet.2026.108930_bib0041","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14043","article-title":"Recurring the transformer for video action recognition","author":"Yang","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0058","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6492","article-title":"Active exploration of multimodal complementarity for few-shot action recognition","author":"Yang","year":"2023"},{"key":"10.1016\/j.neunet.2026.108930_bib0024","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.124183","article-title":"EPK-CLIP: External and priori knowledge CLIP for action recognition","volume":"252","author":"Yang","year":"2024","journal-title":"Expert Syst. Appl."},{"issue":"9","key":"10.1016\/j.neunet.2026.108930_bib0025","doi-asserted-by":"crossref","first-page":"8172","DOI":"10.1109\/TCSVT.2024.3390133","article-title":"GBC: Guided alignment and adaptive boosting CLIP bridging vision and language for robust action recognition","volume":"34","author":"Yang","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.neunet.2026.108930_bib0002","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106622","article-title":"Dark-DSAR: Lightweight one-step pipeline for action recognition in dark videos","volume":"179","author":"Yin","year":"2024","journal-title":"Neural Netw"},{"key":"10.1016\/j.neunet.2026.108930_bib0013","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"525","article-title":"Few-shot action recognition with permutation-invariant attention","author":"Zhang","year":"2020"},{"key":"10.1016\/j.neunet.2026.108930_bib0054","doi-asserted-by":"crossref","unstructured":"S. Zhang, J. Zhou, X. He, Learning implicit temporal alignment for few-shot video classification, 2021, arXiv:2105.04823.","DOI":"10.24963\/ijcai.2021\/181"},{"key":"10.1016\/j.neunet.2026.108930_bib0065","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"297","article-title":"Few-shot action recognition with hierarchical matching and contrastive learning","author":"Zheng","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0043","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16793","article-title":"Regionclip: Region-based language-image pretraining","author":"Zhong","year":"2022"},{"issue":"9","key":"10.1016\/j.neunet.2026.108930_bib0085","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neunet.2026.108930_bib0044","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"16816","article-title":"Conditional prompt learning for vision-language models","author":"Zhou","year":"2022"},{"key":"10.1016\/j.neunet.2026.108930_bib0069","series-title":"Proceedings of the British Machine Vision Conference","article-title":"Few-shot action recognition with prototype-centered attentive learning","author":"Zhu","year":"2021"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026003916?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026003916?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T03:02:58Z","timestamp":1781146978000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026003916"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":85,"alternative-id":["S0893608026003916"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108930","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multimodal-guided prototype calibration and temporal coherence-aware hybrid matching for few-shot action recognition","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108930","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108930"}}