{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T20:36:15Z","timestamp":1770582975573,"version":"3.49.0"},"reference-count":46,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of 
China","doi-asserted-by":"publisher","award":["62376106"],"award-info":[{"award-number":["62376106"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1016\/j.patcog.2025.112074","type":"journal-article","created":{"date-parts":[[2025,7,2]],"date-time":"2025-07-02T11:52:24Z","timestamp":1751457144000},"page":"112074","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Mixture of coarse and fine-grained prompt tuning for vision-language model"],"prefix":"10.1016","volume":"170","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9741-217X","authenticated-orcid":false,"given":"Yansheng","family":"Gao","sequence":"first","affiliation":[]},{"given":"Zixi","family":"Zhu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8503-8061","authenticated-orcid":false,"given":"Shengsheng","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2025.112074_bib0001","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2025.112074_bib0002","series-title":"International Journal of Computer Vision","first-page":"2337","article-title":"Learning to prompt for vision-language models","author":"Zhou","year":"2022"},{"key":"10.1016\/j.patcog.2025.112074_bib0003","series-title":"Computer Vision and Pattern Recognition","first-page":"16795","article-title":"Conditional prompt learning for vision-language models","author":"Zhou","year":"2022"},{"key":"10.1016\/j.patcog.2025.112074_bib0004","series-title":"Computer 
Vision and Pattern Recognition","first-page":"6757","article-title":"Visual-language prompt tuning with knowledge-guided context optimization","author":"Yao","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0005","series-title":"Computer Vision and Pattern Recognition","first-page":"19113","article-title":"Maple: Multi-modal prompt learning","author":"Khattak","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0006","series-title":"Computer Vision and Pattern Recognition","first-page":"12924","article-title":"Dept: decoupled prompt tuning","author":"Zhang","year":"2024"},{"key":"10.1016\/j.patcog.2025.112074_bib0007","series-title":"International Conference on Computer Vision","first-page":"15190","article-title":"Self-regulating prompts: foundational model adaptation without forgetting","author":"Khattak","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0008","series-title":"European Conference on Computer Vision","first-page":"709","article-title":"Visual prompt tuning","author":"Jia","year":"2022"},{"key":"10.1016\/j.patcog.2025.112074_bib0009","series-title":"Computer Vision and Pattern Recognition","first-page":"15028","article-title":"FashionSAP: symbols and attributes prompt for fine-grained fashion vision-language pre-training","author":"Han","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0010","series-title":"International Conference on Computer Vision","first-page":"15659","article-title":"Prompt-aligned gradient for prompt tuning","author":"Zhu","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0011","series-title":"Computer Vision and Pattern Recognition","first-page":"5206","article-title":"Prompt distribution learning","author":"Lu","year":"2022"},{"key":"10.1016\/j.patcog.2025.112074_bib0012","series-title":"European Conference on Computer Vision","first-page":"340","article-title":"SDPT: synchronous dual prompt tuning for fusion-based visual-language pre-trained 
models","author":"Zhou","year":"2024"},{"key":"10.1016\/j.patcog.2025.112074_bib0013","doi-asserted-by":"crossref","first-page":"2056","DOI":"10.1109\/TMM.2023.3291588","article-title":"Dual modality prompt tuning for vision-language pre-trained model","volume":"26","author":"Xing","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2025.112074_bib0014","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110250","article-title":"Exploring low-resource medical image classification with weakly supervised prompt learning","volume":"149","author":"Zheng","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112074_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110096","article-title":"F-SCP: An automatic prompt generation method for specific classes based on visual language pre-training models","volume":"147","author":"Han","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112074_bib0016","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110648","article-title":"Prompt-guided DETR with RoI-pruned masked attention for open-vocabulary object detection","volume":"155","author":"Song","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112074_bib0017","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110861","article-title":"Cluster prototype earth mover\u2019s distance adapters and alignment-guided prompt learning for vision\u2013language models","volume":"156","author":"Dong","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112074_bib0018","series-title":"the Association for Computational Linguistics","first-page":"1559","article-title":"Measuring progress in fine-grained vision-and-language understanding","author":"Bugliarello","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0019","series-title":"Computer Vision and Pattern 
Recognition","first-page":"19381","article-title":"Open-set fine-grained retrieval via prompting vision-language evaluator","author":"Wang","year":"2023"},{"issue":"6","key":"10.1016\/j.patcog.2025.112074_bib0020","doi-asserted-by":"crossref","first-page":"4366","DOI":"10.1109\/TPAMI.2024.3355461","article-title":"Content-aware rectified activation for zero-shot fine-grained image retrieval","volume":"46","author":"Wang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2025.112074_bib0021","series-title":"International Journal of Computer Vision","first-page":"581","article-title":"Clip-adapter: better vision-language models with feature adapters","author":"Gao","year":"2024"},{"key":"10.1016\/j.patcog.2025.112074_bib0022","series-title":"European Conference on Computer Vision","first-page":"493","article-title":"Tip-adapter: Training-free adaption of clip for few-shot classification","author":"Zhang","year":"2022"},{"key":"10.1016\/j.patcog.2025.112074_bib0023","series-title":"Computer Vision and Pattern Recognition","first-page":"23826","article-title":"MMA: multi-modal adapter for vision-language models","author":"Yang","year":"2024"},{"key":"10.1016\/j.patcog.2025.112074_bib0024","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110559","article-title":"Ta-Adapter: enhancing few-shot CLIP with task-aware encoders","volume":"153","author":"Zhang","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112074_bib0025","series-title":"arXiv","article-title":"Gpt-4 technical report","author":"Achiam","year":"2024"},{"issue":"1","key":"10.1016\/j.patcog.2025.112074_bib0026","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","article-title":"Adaptive mixtures of local experts","volume":"3","author":"Jacobs","year":"1991","journal-title":"Neural Comput."},{"key":"10.1016\/j.patcog.2025.112074_bib0027","series-title":"Computer Vision and Pattern 
Recognition","first-page":"248","article-title":"ImageNet: a large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.patcog.2025.112074_bib0028","series-title":"Computer Vision and Pattern Recognition workshop","first-page":"178","article-title":"Learning generative visual models from few training examples: an incremental Bayesian approach tested on 101 object categories","author":"Fei-Fei","year":"2004"},{"key":"10.1016\/j.patcog.2025.112074_bib0029","series-title":"Computer Vision and Pattern Recognition","first-page":"3498","article-title":"Cats and dogs","author":"Parkhi","year":"2012"},{"key":"10.1016\/j.patcog.2025.112074_bib0030","series-title":"International Conference on Computer Vision workshops","first-page":"554","article-title":"3D object representations for fine-grained categorization","author":"Krause","year":"2013"},{"key":"10.1016\/j.patcog.2025.112074_bib0031","series-title":"Sixth Indian Conference on Computer Vision","first-page":"722","article-title":"Automated flower classification over a large number of classes","author":"Nilsback","year":"2008"},{"key":"10.1016\/j.patcog.2025.112074_bib0032","series-title":"European Conference on Computer Vision","first-page":"446","article-title":"Food-101 - mining discriminative components with random forests","author":"Bossard","year":"2014"},{"key":"10.1016\/j.patcog.2025.112074_sbref0033","series-title":"Fine-grained visual classification of aircraft","author":"Maji","year":"2013"},{"issue":"7","key":"10.1016\/j.patcog.2025.112074_bib0034","doi-asserted-by":"crossref","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","article-title":"EuroSAT: A novel dataset and deep learning benchmark for land use and land cover classification","volume":"12","author":"Helber","year":"2019","journal-title":"IEEE J. Sel. Top. Appl. Earth Obs. 
Remote Sens."},{"key":"10.1016\/j.patcog.2025.112074_sbref0035","series-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Kay","year":"2017"},{"key":"10.1016\/j.patcog.2025.112074_bib0036","series-title":"Computer Vision and Pattern Recognition","first-page":"3606","article-title":"Describing textures in the wild","author":"Cimpoi","year":"2014"},{"key":"10.1016\/j.patcog.2025.112074_bib0037","series-title":"Computer Vision and Pattern Recognition","first-page":"3485","article-title":"Sun database: large-scale scene recognition from abbey to zoo","author":"Xiao","year":"2010"},{"key":"10.1016\/j.patcog.2025.112074_bib0038","series-title":"International Conference on Machine Learning","first-page":"5389","article-title":"Do ImageNet classifiers generalize to ImageNet?","author":"Recht","year":"2019"},{"key":"10.1016\/j.patcog.2025.112074_bib0039","first-page":"1","article-title":"Learning robust global representations by penalizing local predictive power","volume":"32","author":"Wang","year":"2019","journal-title":"Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2025.112074_bib0040","series-title":"Computer Vision and Pattern Recognition","first-page":"15262","article-title":"Natural adversarial examples","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.patcog.2025.112074_bib0041","series-title":"International Conference on Computer Vision","first-page":"8340","article-title":"The many faces of robustness: a critical analysis of out-of-distribution generalization","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.patcog.2025.112074_bib0042","series-title":"International Conference on Computer Vision","first-page":"15645","article-title":"What does a platypus look like? 
Generating customized prompts for zero-shot image classification","author":"Pratt","year":"2023"},{"key":"10.1016\/j.patcog.2025.112074_bib0043","series-title":"AAAI Conference on Artificial Intelligence","first-page":"5749","article-title":"Learning hierarchical prompt with structured linguistic knowledge for vision-language models","author":"Wang","year":"2024"},{"key":"10.1016\/j.patcog.2025.112074_bib0044","series-title":"International Conference on Learning Representations","article-title":"Consistency-guided prompt learning for vision-language models","author":"Roy","year":"2024"},{"key":"10.1016\/j.patcog.2025.112074_bib0045","series-title":"Association for the Advancement of Artificial Intelligence","article-title":"TextRefiner: internal visual feature as efficient refiner for vision-language models prompt tuning","author":"Xie","year":"2025"},{"key":"10.1016\/j.patcog.2025.112074_bib0046","series-title":"International Conference on Computer Vision","first-page":"2704","article-title":"Diverse data augmentation with diffusions for effective test-time prompt tuning","author":"Feng","year":"2023"}],"container-title":["Pattern 
Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325007344?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325007344?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T19:24:01Z","timestamp":1757445841000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320325007344"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":46,"alternative-id":["S0031320325007344"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2025.112074","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Mixture of coarse and fine-grained prompt tuning for vision-language model","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2025.112074","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"112074"}}