{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T21:05:53Z","timestamp":1779224753217,"version":"3.51.4"},"reference-count":47,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109082","type":"journal-article","created":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T06:31:13Z","timestamp":1778481073000},"page":"109082","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Unsupervised fine-tuning of vision-language models by fusing classifier tuning and visual prompt tuning"],"prefix":"10.1016","volume":"203","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6879-5347","authenticated-orcid":false,"given":"Wenyang","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7831-6713","authenticated-orcid":false,"given":"Zhanxuan","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yonghang","family":"Tai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feiping","family":"Nie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109082_bib0001","unstructured":"Bahng, H., Jahanian, A., Sankaranarayanan, S., & Isola, P. (2022). Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv: 2203.17274."},{"key":"10.1016\/j.neunet.2026.109082_bib0002","series-title":"Computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6-12, 2014, proceedings, part VI 13","first-page":"446","article-title":"Food-101\u2013mining discriminative components with random forests","author":"Bossard","year":"2014"},{"key":"10.1016\/j.neunet.2026.109082_bib0003","series-title":"International conference on learning representations","article-title":"Prompt learning with optimal transport for vision-language models","author":"Chen","year":"2023"},{"key":"10.1016\/j.neunet.2026.109082_bib0004","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"15750","article-title":"Exploring simple siamese representation learning","author":"Chen","year":"2021"},{"key":"10.1016\/j.neunet.2026.109082_bib0005","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"3606","article-title":"Describing textures in the wild","author":"Cimpoi","year":"2014"},{"key":"10.1016\/j.neunet.2026.109082_bib0006","series-title":"European conference on computer vision","first-page":"236","article-title":"Contrastive vision-language pre-training with limited resources","author":"Cui","year":"2022"},{"key":"10.1016\/j.neunet.2026.109082_bib0007","series-title":"2009 IEEE conference on computer vision and pattern recognition","first-page":"248","article-title":"ImageNet: A large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.neunet.2026.109082_bib0008","series-title":"International conference on learning representations","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"issue":"2","key":"10.1016\/j.neunet.2026.109082_bib0009","doi-asserted-by":"crossref","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","article-title":"Clip-adapter: Better vision-language models with feature adapters","volume":"132","author":"Gao","year":"2024","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.neunet.2026.109082_bib0010","series-title":"Proceedings of the 2022 conference on empirical methods in natural language processing","article-title":"CPL: Counterfactual prompt learning for vision and language models","author":"He","year":"2022"},{"issue":"7","key":"10.1016\/j.neunet.2026.109082_bib0011","doi-asserted-by":"crossref","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","article-title":"EuroSAT: A novel dataset and deep learning benchmark for land use and land cover classification","volume":"12","author":"Helber","year":"2019","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"10.1016\/j.neunet.2026.109082_bib0012","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"8340","article-title":"The many faces of robustness: A critical analysis of out-of-distribution generalization","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.neunet.2026.109082_bib0013","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"15262","article-title":"Natural adversarial examples","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.neunet.2026.109082_bib0014","unstructured":"Huang, T., Chu, J., & Wei, F. (2022). Unsupervised prompt learning for vision-language models. arXiv preprint arXiv: 2204.03649."},{"key":"10.1016\/j.neunet.2026.109082_bib0015","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"23773","article-title":"LP++: A surprisingly strong linear probe for few-shot clip","author":"Huang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109082_bib0016","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5070","article-title":"Label propagation for deep semi-supervised learning","author":"Iscen","year":"2019"},{"key":"10.1016\/j.neunet.2026.109082_bib0017","series-title":"International conference on machine learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.neunet.2026.109082_bib0018","series-title":"European conference on computer vision","first-page":"709","article-title":"Visual prompt tuning","author":"Jia","year":"2022"},{"key":"10.1016\/j.neunet.2026.109082_bib0019","unstructured":"Kahana, J., Cohen, N., & Hoshen, Y. (2022). Improving zero-shot models with label distribution priors. arXiv preprint arXiv: 2212.00784."},{"key":"10.1016\/j.neunet.2026.109082_bib0020","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"19113","article-title":"Maple: Multi-modal prompt learning","author":"Khattak","year":"2023"},{"key":"10.1016\/j.neunet.2026.109082_bib0021","unstructured":"Krizhevsky, A., Hinton, G. et al. (2009). Learning multiple layers of features from tiny images. Technical report, University of Toronto."},{"key":"10.1016\/j.neunet.2026.109082_bib0022","series-title":"International conference on learning representations","article-title":"Masked unsupervised self-training for label-free image classification","author":"Li","year":"2023"},{"key":"10.1016\/j.neunet.2026.109082_bib0023","series-title":"International conference on learning representations","article-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm","author":"Li","year":"2022"},{"key":"10.1016\/j.neunet.2026.109082_bib0024","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"26617","article-title":"PromptKD: Unsupervised prompt distillation for vision-language models","author":"Li","year":"2024"},{"key":"10.1016\/j.neunet.2026.109082_bib0025","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., & Vedaldi, A. (2013). Fine-grained visual classification of aircraft. arXiv preprint arXiv: 1306.5151."},{"key":"10.1016\/j.neunet.2026.109082_bib0026","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"28816","article-title":"Transductive zero-shot and few-shot CLIP","author":"Martin","year":"2024"},{"key":"10.1016\/j.neunet.2026.109082_bib0027","first-page":"5765","article-title":"LaFTer: Label-free tuning of zero-shot classifier using language and unlabeled image collections","volume":"36","author":"Mirza","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109082_bib0028","series-title":"2008 Sixth Indian conference on computer vision, graphics & image processing","first-page":"722","article-title":"Automated flower classification over a large number of classes","author":"Nilsback","year":"2008"},{"key":"10.1016\/j.neunet.2026.109082_bib0029","series-title":"2012 IEEE conference on computer vision and pattern recognition","first-page":"3498","article-title":"Cats and dogs","author":"Parkhi","year":"2012"},{"key":"10.1016\/j.neunet.2026.109082_bib0030","first-page":"25461","article-title":"Intra-modal proxy learning for zero-shot visual categorization with clip","volume":"36","author":"Qian","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109082_bib0031","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neunet.2026.109082_bib0032","unstructured":"Rong, J., Chen, H., Chen, T., Ou, L., Yu, X., & Liu, Y. (2023). Retrieval-enhanced visual prompt learning for few-shot classification. arXiv preprint arXiv: 2306.02243."},{"key":"10.1016\/j.neunet.2026.109082_bib0033","first-page":"14274","article-title":"Test-time prompt tuning for zero-shot generalization in vision-language models","volume":"35","author":"Shu","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109082_bib0034","unstructured":"Soomro, K., Zamir, A. R., & Shah, M. (2012). UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv: 1212.0402."},{"key":"10.1016\/j.neunet.2026.109082_bib0035","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"2818","article-title":"Rethinking the inception architecture for computer vision","author":"Szegedy","year":"2016"},{"key":"10.1016\/j.neunet.2026.109082_bib0036","series-title":"International conference on machine learning","first-page":"33816","article-title":"Pouf: Prompt-oriented unsupervised fine-tuning for large pre-trained models","author":"Tanwisuth","year":"2023"},{"key":"10.1016\/j.neunet.2026.109082_bib0037","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109082_bib0038","first-page":"10506","article-title":"Learning robust global representations by penalizing local predictive power","volume":"32","author":"Wang","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109082_bib0039","series-title":"International conference on learning representations","article-title":"A hard-to-beat baseline for training-free clip-based adaptation","author":"Wang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109082_bib0040","series-title":"2010 IEEE computer society conference on computer vision and pattern recognition","first-page":"3485","article-title":"Sun database: Large-scale scene recognition from abbey to zoo","author":"Xiao","year":"2010"},{"key":"10.1016\/j.neunet.2026.109082_bib0041","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10899","article-title":"Task residual for tuning vision-language models","author":"Yu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109082_bib0042","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., & Loy, C. C. (2022). Unified vision and language prompt learning. arXiv preprint arXiv: 2210.07225."},{"key":"10.1016\/j.neunet.2026.109082_bib0043","doi-asserted-by":"crossref","first-page":"5625","DOI":"10.1109\/TPAMI.2024.3369699","article-title":"Vision-language models for vision tasks: A survey","volume":"46","author":"Zhang","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.109082_bib0044","series-title":"European conference on computer vision","first-page":"493","article-title":"Tip-adapter: Training-free adaption of clip for few-shot classification","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neunet.2026.109082_bib0045","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"16816","article-title":"Conditional prompt learning for vision-language models","author":"Zhou","year":"2022"},{"issue":"9","key":"10.1016\/j.neunet.2026.109082_bib0046","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.neunet.2026.109082_bib0047","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"15659","article-title":"Prompt-aligned gradient for prompt tuning","author":"Zhu","year":"2023"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005423?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005423?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T20:10:10Z","timestamp":1779221410000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026005423"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":47,"alternative-id":["S0893608026005423"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109082","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Unsupervised fine-tuning of vision-language models by fusing classifier tuning and visual prompt tuning","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109082","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109082"}}