{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T10:08:52Z","timestamp":1772878132592,"version":"3.50.1"},"reference-count":55,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100015803","name":"Tencent","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100015803","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,10]]},"DOI":"10.1016\/j.patcog.2026.113316","type":"journal-article","created":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T07:41:18Z","timestamp":1772178078000},"page":"113316","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Towards fine-grained vision-language alignment for few-shot anomaly detection"],"prefix":"10.1016","volume":"178","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6507-666X","authenticated-orcid":false,"given":"Yuanting","family":"Fan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6985-8238","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4326-4511","authenticated-orcid":false,"given":"Xiaochen","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2572-8156","authenticated-orcid":false,"given":"Bin-Bin","family":"Gao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0242-6481","authenticated-orcid":false,"given":"Jian","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9945-8131","authenticated-orcid":false,"given":"Yong","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1887-6406","authenticated-orcid":false,"given":"Jinlong","family":"Peng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4216-8090","authenticated-orcid":false,"given":"Chengjie","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113316_bib0001","series-title":"Real-iad: A real-world multi-view dataset for benchmarking versatile industrial anomaly detection","first-page":"22883","author":"Wang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0002","series-title":"Mvtec ad-a comprehensive real-world dataset for unsupervised anomaly detection","first-page":"9592-9600","author":"Bergmann","year":"2019"},{"key":"10.1016\/j.patcog.2026.113316_bib0003","series-title":"Spot-the-difference self-supervised pre-training for anomaly detection and segmentation","first-page":"392","author":"Zou","year":"2022"},{"key":"10.1016\/j.patcog.2026.113316_bib0004","series-title":"Towards continual adaptation in industrial anomaly detection","author":"Li","year":"2022"},{"key":"10.1016\/j.patcog.2026.113316_bib0005","series-title":"M3dm-Nr: Rgb-3d noisy-resistant industrial anomaly detection via multimodal denoising","first-page":"2871-2880","author":"Wang","year":"2025"},{"key":"10.1016\/j.patcog.2026.113316_bib0006","article-title":"Softpatch : fully unsupervised anomaly classification and segmentation","volume":"161","author":"Wang","year":"2025","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113316_bib0007","series-title":"Padim: a patch distribution modeling framework for anomaly detection and localization","first-page":"475","author":"Defard","year":"2021"},{"key":"10.1016\/j.patcog.2026.113316_bib0008","series-title":"Towards total recall in industrial anomaly detection","first-page":"14318","author":"Roth","year":"2022"},{"key":"10.1016\/j.patcog.2026.113316_bib0009","series-title":"Registration based few-shot anomaly detection","first-page":"303","author":"Huang","year":"2022"},{"key":"10.1016\/j.patcog.2026.113316_bib0010","series-title":"Winclip: Zero-\/few-shot anomaly classification and segmentation","first-page":"19606","author":"Jeong","year":"2023"},{"key":"10.1016\/j.patcog.2026.113316_bib0011","series-title":"Promptad: Learning prompts with only normal samples for few-shot anomaly detection","first-page":"16838","author":"Li","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0012","doi-asserted-by":"crossref","first-page":"1932","DOI":"10.1609\/aaai.v38i3.27963","article-title":"Anomalygpt: detecting industrial anomalies using large vision-language models","volume":"38","author":"Gu","year":"2024","journal-title":"AAAI"},{"key":"10.1016\/j.patcog.2026.113316_bib0013","series-title":"Univad: A training-free unified model for few-shot visual anomaly detection","first-page":"15194","author":"Gu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113316_bib0014","doi-asserted-by":"crossref","first-page":"7347","DOI":"10.1609\/aaai.v39i7.32790","article-title":"Kernel-aware graph prompt learning for few-shot anomaly detection","volume":"39","author":"Tao","year":"2025","journal-title":"AAAI"},{"key":"10.1016\/j.patcog.2026.113316_bib0015","first-page":"125287","article-title":"Resad: a simple framework for class generalizable anomaly detection","volume":"37","author":"Yao","year":"2024","journal-title":"NeurIPS"},{"key":"10.1016\/j.patcog.2026.113316_bib0016","series-title":"ICML","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113316_bib0017","series-title":"The first-ever comprehensive benchmark for multimodal large language models in industrial anomaly detection","author":"Jiang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0018","series-title":"Few-shot anomaly-driven generation for anomaly classification and segmentation","author":"Gui","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0019","series-title":"Learning to detect multi-class anomalies with just one normal image prompt","author":"Gao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0020","doi-asserted-by":"crossref","first-page":"7096","DOI":"10.1609\/aaai.v38i7.28537","article-title":"Learning task-aware language-image representation for class-incremental object detection","volume":"38","author":"Zhang","year":"2024","journal-title":"AAAI"},{"key":"10.1016\/j.patcog.2026.113316_bib0021","unstructured":"H. Deng, Z. Zhang, J. Bao, X. Li, Anovl, Adapting vision-language models for unified zero-shot anomaly localization, Technical Report, arXiv preprint arXiv: 2308.15939, 2023."},{"key":"10.1016\/j.patcog.2026.113316_bib0022","series-title":"Metauas: Universal anomaly segmentation with one-prompt meta-learning","author":"Gao","year":"2024"},{"issue":"9","key":"10.1016\/j.patcog.2026.113316_bib0023","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"IJCV"},{"key":"10.1016\/j.patcog.2026.113316_bib0024","unstructured":"T. Ren, S. Liu, A. Zeng, J. Lin, K. Li, H. Cao, J. Chen, X. Huang, Y. Chen, F. Yan, et al. Grounded sam: Assembling open-world models for diverse visual tasks, Technical Report, arXiv preprint arXiv: 2401.14159, 2024."},{"key":"10.1016\/j.patcog.2026.113316_bib0025","unstructured":"J. Achiam, S. Adler, S. Agarwal, L. Ahmad, I. Akkaya, F.L. Aleman, D. Almeida, J. Altenschmidt, S. Altman, S. Anadkat, et al, Gpt-4 technical report, arXiv preprint arXiv: 2303.08774, 2023."},{"key":"10.1016\/j.patcog.2026.113316_bib0026","series-title":"Student-teacher feature pyramid matching for anomaly detection","author":"Wang","year":"2021"},{"key":"10.1016\/j.patcog.2026.113316_bib0027","series-title":"A memory and retrieval transformer-based unsupervised learning model for anomaly detection and segmentation","first-page":"113004","author":"Guo","year":"2026"},{"key":"10.1016\/j.patcog.2026.113316_bib0028","series-title":"A unified anomaly synthesis strategy with gradient ascent for industrial anomaly detection and localization","author":"Chen","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0029","series-title":"Unsupervised surface anomaly detection with diffusion probabilistic model","first-page":"6782","author":"Zhang","year":"2023"},{"key":"10.1016\/j.patcog.2026.113316_bib0030","series-title":"Dynamic addition of noise in a diffusion model for anomaly detection","author":"Tebbe","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0031","series-title":"Dinomaly: The less is more philosophy in multi-class unsupervised anomaly detection","author":"Guo","year":"2025"},{"key":"10.1016\/j.patcog.2026.113316_bib0032","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"NeurIPS"},{"key":"10.1016\/j.patcog.2026.113316_bib0033","series-title":"Reproducible scaling laws for contrastive language-image learning","first-page":"2818","author":"Cherti","year":"2023"},{"key":"10.1016\/j.patcog.2026.113316_bib0034","series-title":"Adaptive region-aware dynamic acceleration diffusion model for real-world image super-resolution","author":"Fan","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0035","series-title":"Dual-interrelated diffusion model for few-shot anomaly image generation","first-page":"30420","author":"Jin","year":"2025"},{"key":"10.1016\/j.patcog.2026.113316_bib0036","doi-asserted-by":"crossref","unstructured":"T. Shin, Y. Razeghi, R.L. Logan, V. I, E. Wallace, S. Singh, Eliciting knowledge from language models with automatically generated prompts, arXiv preprint arXiv: 2010.15980, 2020.","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"10.1016\/j.patcog.2026.113316_bib0037","series-title":"Adaclip: Adapting clip with hybrid learnable prompts for zero-shot anomaly detection","author":"Cao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0038","series-title":"Anomalyclip: Object-agnostic prompt learning for zero-shot anomaly detection","author":"Zhou","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0039","series-title":"Adapting clip for universal visual anomaly detection","author":"Gao","year":"2026"},{"key":"10.1016\/j.patcog.2026.113316_bib0040","unstructured":"N. Cohen, Y. Hoshen, Sub-image anomaly detection with deep pyramid correspondences, arXiv preprint arXiv: 2005.02357, 2020."},{"key":"10.1016\/j.patcog.2026.113316_bib0041","series-title":"Same same but differnet: Semi-supervised defect detection with normalizing flows","author":"Rudolph","year":"2021"},{"key":"10.1016\/j.patcog.2026.113316_bib0042","series-title":"One-shot unsupervised industrial anomaly detection: enhanced performance under extreme data scarcity","first-page":"112759","author":"Zhou","year":"2026"},{"key":"10.1016\/j.patcog.2026.113316_bib0043","series-title":"Exploring intrinsic normal prototypes within a single image for universal anomaly detection","author":"Luo","year":"2025"},{"key":"10.1016\/j.patcog.2026.113316_bib0044","unstructured":"M. Oquab, T. Darcet, T. Moutakanni, H. Vo, M. Szafraniec, V. Khalidov, P. Fernandez, D. Haziza, F. Massa, A. El-Nouby, et al, arXiv preprint arXiv: 2005.02357, 2023."},{"key":"10.1016\/j.patcog.2026.113316_bib0045","series-title":"Filo: zero-shot anomaly detection by fine-grained description and high-quality localization","author":"Gu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0046","unstructured":"A. Hurst, A. Lerer, A.P. Goucher, A. Perelman, A. Ramesh, A. Clark, A. Ostrow, A. Welihinda, A. Hayes, A. Radford, et al, Gpt-4o system card, arXiv preprint arXiv: 2410.21276, 2024."},{"key":"10.1016\/j.patcog.2026.113316_bib0047","unstructured":"G. Comanici, E. Bieber, M. Schaekermann, I. Pasupat, N. Sachdeva, I. Dhillon, M. Blistein, O. Ram, D. Zhang, E. Rosen, et al., Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities, arXiv preprint arXiv: 2507.06261, 2025."},{"issue":"1","key":"10.1016\/j.patcog.2026.113316_bib0048","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","article-title":"Adaptive mixtures of local experts","volume":"3","author":"Jacobs","year":"1991","journal-title":"Neural Comput."},{"key":"10.1016\/j.patcog.2026.113316_bib0049","first-page":"135","article-title":"Study on density peaks clustering based on k-nearest neighbors and principal component analysis,","volume":"99","author":"Du","year":"2016","journal-title":"KBS"},{"key":"10.1016\/j.patcog.2026.113316_bib0050","unstructured":"H. Wang, Q. Wang, X. Bai, Z. Qin, A. Chen, Instantstyle: Free lunch towards style-preserving in text-to-image generation, arXiv preprint arXiv: 2404.02733, 2024."},{"key":"10.1016\/j.patcog.2026.113316_bib0051","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2026.113316_bib0052","series-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.patcog.2026.113316_bib0053","series-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021"},{"key":"10.1016\/j.patcog.2026.113316_bib0054","series-title":"Toward generalist anomaly detection via in-context residual learning with few-shot sample prompts","author":"Zhu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113316_bib0055","series-title":"Dsr-a dual subspace re-projection network for surface anomaly detection","author":"Zavrtanik","year":"2022"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326002815?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326002815?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T16:26:14Z","timestamp":1772814374000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326002815"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,10]]},"references-count":55,"alternative-id":["S0031320326002815"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113316","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Towards fine-grained vision-language alignment for few-shot anomaly detection","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113316","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113316"}}