{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T02:02:02Z","timestamp":1780020122015,"version":"3.53.1"},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100017596","name":"Natural Science Basic Research Program of Shaanxi Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100017596","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013101","name":"Scientific Research Plan Projects of Shaanxi Education Department","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013101","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100011160","name":"Beihang University State Key Laboratory of Virtual Reality Technology and Systems","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100011160","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116115","type":"journal-article","created":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T15:30:22Z","timestamp":1778945422000},"page":"116115","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Knowledge-injected prompt tuning with semantic regularization for fine-grained image recognition"],"prefix":"10.1016","volume":"346","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7565-2045","authenticated-orcid":false,"given":"Qinyu","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xinda","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7213-0103","authenticated-orcid":false,"given":"Qi","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pengbo","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guohua","family":"Geng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116115_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113127","article-title":"From multi-scale grids to dynamic regions: Dual-relation enhanced transformer for image captioning","volume":"311","author":"Zhou","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116115_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.111433","article-title":"Top-down framework for weakly-supervised grounded image captioning","volume":"287","author":"Cai","year":"2024","journal-title":"Knowl.-Based Syst."},{"issue":"2","key":"10.1016\/j.knosys.2026.116115_b3","doi-asserted-by":"crossref","first-page":"1785","DOI":"10.1109\/TNNLS.2022.3185320","article-title":"Adaptive semantic-enhanced transformer for image captioning","volume":"35","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116115_b4","article-title":"Advancing causal intervention in image captioning with causal prompt","author":"Yu","year":"2024","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116115_b5","doi-asserted-by":"crossref","first-page":"6047","DOI":"10.1109\/TIP.2023.3328224","article-title":"Changes to captions: An attentive network for remote sensing change captioning","volume":"32","author":"Chang","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.116115_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112077","article-title":"Diverse and tailored image generation for zero-shot multi-label classification","volume":"299","author":"Zhang","year":"2024","journal-title":"Knowl.-Based Syst."},{"issue":"7","key":"10.1016\/j.knosys.2026.116115_b7","doi-asserted-by":"crossref","first-page":"12326","DOI":"10.1109\/TNNLS.2024.3477467","article-title":"Few-shot image generation via style adaptation and content preservation","volume":"36","author":"He","year":"2025","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116115_b8","doi-asserted-by":"crossref","first-page":"5737","DOI":"10.1109\/TIP.2023.3323799","article-title":"Txt2Img-MHN: Remote sensing image generation from text using modern Hopfield networks","volume":"32","author":"Xu","year":"2023","journal-title":"IEEE Trans. Image Process."},{"issue":"12","key":"10.1016\/j.knosys.2026.116115_b9","doi-asserted-by":"crossref","first-page":"10309","DOI":"10.1109\/TNNLS.2022.3165573","article-title":"DR-GAN: Distribution regularization for text-to-image generation","volume":"34","author":"Tan","year":"2022","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116115_b10","doi-asserted-by":"crossref","first-page":"2572","DOI":"10.1109\/TIP.2024.3374211","article-title":"Convolution-enhanced bi-branch adaptive transformer with cross-task interaction for food category and ingredient recognition","volume":"33","author":"Liu","year":"2024","journal-title":"IEEE Trans. Image Process."},{"issue":"10","key":"10.1016\/j.knosys.2026.116115_b11","doi-asserted-by":"crossref","first-page":"13550","DOI":"10.1109\/TNNLS.2023.3269949","article-title":"FBANet: An effective data mining method for food olfactory EEG recognition","volume":"35","author":"Xia","year":"2023","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"issue":"8","key":"10.1016\/j.knosys.2026.116115_b12","doi-asserted-by":"crossref","first-page":"9932","DOI":"10.1109\/TPAMI.2023.3237871","article-title":"Large scale visual food recognition","volume":"45","author":"Min","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116115_b13","series-title":"Fine-grained image-text correspondence with cost aggregation for open-vocabulary part segmentation","author":"Choi","year":"2025"},{"issue":"9","key":"10.1016\/j.knosys.2026.116115_b14","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.116115_b15","doi-asserted-by":"crossref","unstructured":"K. Zhou, J. Yang, C.C. Loy, Z. Liu, Conditional prompt learning for vision-language models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16816\u201316825.","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"10.1016\/j.knosys.2026.116115_b16","doi-asserted-by":"crossref","unstructured":"H. Yao, R. Zhang, C. Xu, Visual-language prompt tuning with knowledge-guided context optimization, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 6757\u20136767.","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"10.1016\/j.knosys.2026.116115_b17","doi-asserted-by":"crossref","unstructured":"H. Yao, R. Zhang, C. Xu, TCP: Textual-based Class-aware Prompt tuning for Visual-Language Model, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 23438\u201323448.","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"10.1016\/j.knosys.2026.116115_b18","doi-asserted-by":"crossref","unstructured":"B. Kan, T. Wang, W. Lu, X. Zhen, W. Guan, F. Zheng, Knowledge-aware prompt tuning for generalizable vision-language models, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 15670\u201315680.","DOI":"10.1109\/ICCV51070.2023.01436"},{"key":"10.1016\/j.knosys.2026.116115_b19","doi-asserted-by":"crossref","unstructured":"M.U. Khattak, M.F. Naeem, M. Naseer, L. Van Gool, F. Tombari, Learning to prompt with text only supervision for vision-language models, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 39, 2025, pp. 4230\u20134238.","DOI":"10.1609\/aaai.v39i4.32444"},{"key":"10.1016\/j.knosys.2026.116115_b20","doi-asserted-by":"crossref","unstructured":"Z. Li, Y. Song, M.-M. Cheng, X. Li, J. Yang, Advancing textual prompt learning with anchored attributes, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 3618\u20133627.","DOI":"10.1109\/ICCV51701.2025.00345"},{"issue":"8","key":"10.1016\/j.knosys.2026.116115_b21","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"10.1016\/j.knosys.2026.116115_b22","doi-asserted-by":"crossref","unstructured":"J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, Bert: Pre-training of deep bidirectional transformers for language understanding, in: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), 2019, pp. 4171\u20134186.","DOI":"10.18653\/v1\/N19-1423"},{"key":"10.1016\/j.knosys.2026.116115_b23","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.knosys.2026.116115_b24","doi-asserted-by":"crossref","unstructured":"B. Zhu, Y. Niu, Y. Han, Y. Wu, H. Zhang, Prompt-aligned gradient for prompt tuning, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 15659\u201315669.","DOI":"10.1109\/ICCV51070.2023.01435"},{"key":"10.1016\/j.knosys.2026.116115_b25","unstructured":"S. Roy, A. Etemad, Consistency-guided Prompt Learning for Vision-Language Models, in: International Conference on Learning Representations, 2024, pp. 6485\u20136498."},{"key":"10.1016\/j.knosys.2026.116115_b26","doi-asserted-by":"crossref","unstructured":"J. Gao, J. Ruan, S. Xiang, Z. Yu, K. Ji, M. Xie, T. Liu, Y. Fu, Lamm: Label alignment for multi-modal prompt learning, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38, 2024, pp. 1815\u20131823.","DOI":"10.1609\/aaai.v38i3.27950"},{"key":"10.1016\/j.knosys.2026.116115_b27","doi-asserted-by":"crossref","unstructured":"J. Zhang, J. Huang, X. Zhang, L. Shao, S. Lu, Historical Test-time Prompt Tuning for Vision Foundation Models, in: The Thirty-Eighth Annual Conference on Neural Information Processing Systems, 2024.","DOI":"10.52202\/079017-0410"},{"issue":"13","key":"10.1016\/j.knosys.2026.116115_b28","doi-asserted-by":"crossref","first-page":"11907","DOI":"10.1007\/s11042-024-19410-6","article-title":"Attribute guided fusion network for obtaining fine-grained image captions","volume":"84","author":"Hossen","year":"2025","journal-title":"Multimedia Tools Appl."},{"key":"10.1016\/j.knosys.2026.116115_b29","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2026.114014","article-title":"Hierarchical region-context attention for image captioning","volume":"168","author":"Hossain","year":"2026","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.116115_b30","doi-asserted-by":"crossref","DOI":"10.1016\/j.dsp.2025.105155","article-title":"ARAFNet: an attribute refinement attention fusion network for advanced visual captioning","volume":"162","author":"Hossen","year":"2025","journal-title":"Digit. Signal Process."},{"key":"10.1016\/j.knosys.2026.116115_b31","doi-asserted-by":"crossref","first-page":"477","DOI":"10.1007\/s41095-020-0184-6","article-title":"A new dataset of dog breed images and a benchmark for finegrained classification","volume":"6","author":"Zou","year":"2020","journal-title":"Comput. Vis. Media"},{"key":"10.1016\/j.knosys.2026.116115_b32","doi-asserted-by":"crossref","unstructured":"Z. Sun, Y. Yao, X.-S. Wei, Y. Zhang, F. Shen, J. Wu, J. Zhang, H.T. Shen, Webly supervised fine-grained recognition: Benchmark datasets and an approach, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10602\u201310611.","DOI":"10.1109\/ICCV48922.2021.01043"},{"key":"10.1016\/j.knosys.2026.116115_b33","series-title":"ECCV","first-page":"446","article-title":"Food-101 \u2013 mining discriminative components with random forests","author":"Bossard","year":"2014"},{"key":"10.1016\/j.knosys.2026.116115_b34","series-title":"Fine-grained visual classification of aircraft","author":"Maji","year":"2013"},{"key":"10.1016\/j.knosys.2026.116115_b35","doi-asserted-by":"crossref","unstructured":"M.-E. Nilsback, A. Zisserman, Automated Flower Classification over a Large Number of Classes, in: Indian Conference on Computer Vision, Graphics & Image Processing, 2008.","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"10.1016\/j.knosys.2026.116115_b36","doi-asserted-by":"crossref","unstructured":"O.M. Parkhi, A. Vedaldi, A. Zisserman, C.V. Jawahar, Cats and dogs, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2012, pp. 3498\u20133505.","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"10.1016\/j.knosys.2026.116115_b37","doi-asserted-by":"crossref","unstructured":"J. Krause, M. Stark, J. Deng, L. Fei-Fei, 3d object representations for fine-grained categorization, in: Proceedings of the IEEE International Conference on Computer Vision Workshops, 2013, pp. 554\u2013561.","DOI":"10.1109\/ICCVW.2013.77"},{"key":"10.1016\/j.knosys.2026.116115_b38","unstructured":"A. Khosla, N. Jayadevaprakash, B. Yao, L. Fei-Fei, Novel Dataset for Fine-Grained Image Categorization, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Vol. 5, 2011, p. 2."},{"key":"10.1016\/j.knosys.2026.116115_b39","doi-asserted-by":"crossref","unstructured":"J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, L. Fei-Fei, Imagenet: A large-scale hierarchical image database, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2009, pp. 248\u2013255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10.1016\/j.knosys.2026.116115_b40","doi-asserted-by":"crossref","unstructured":"S. Hou, Y. Feng, Z. Wang, Vegfru: A domain-specific dataset for fine-grained visual categorization, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 541\u2013549.","DOI":"10.1109\/ICCV.2017.66"},{"key":"10.1016\/j.knosys.2026.116115_b41","series-title":"The caltech-ucsd birds-200\u20132011 dataset","author":"Wah","year":"2011"},{"key":"10.1016\/j.knosys.2026.116115_b42","doi-asserted-by":"crossref","unstructured":"R.R. Selvaraju, M. Cogswell, A. Das, R. Vedantam, D. Parikh, D. Batra, Grad-cam: Visual explanations from deep networks via gradient-based localization, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 618\u2013626.","DOI":"10.1109\/ICCV.2017.74"},{"key":"10.1016\/j.knosys.2026.116115_b43","series-title":"European Conference on Computer Vision","first-page":"310","article-title":"Long-clip: Unlocking the long-text capability of clip","author":"Zhang","year":"2024"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008415?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008415?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T01:08:42Z","timestamp":1780016922000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126008415"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":43,"alternative-id":["S0950705126008415"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116115","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Knowledge-injected prompt tuning with semantic regularization for fine-grained image recognition","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116115","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116115"}}