{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T16:54:57Z","timestamp":1776099297182,"version":"3.50.1"},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62502233"],"award-info":[{"award-number":["62502233"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007129","name":"Natural Science Foundation of Shandong Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005374","name":"Nanjing University of Posts and Telecommunications","doi-asserted-by":"publisher","award":["NY224061"],"award-info":[{"award-number":["NY224061"]}],"id":[{"id":"10.13039\/501100005374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"publisher","award":["BK20250650"],"award-info":[{"award-number":["BK20250650"]}],"id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.inffus.2026.104260","type":"journal-article","created":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T08:58:41Z","timestamp":1772096321000},"page":"104260","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Uncertainty and diversity based selection for active learning in vision-language models"],"prefix":"10.1016","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7733-0371","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"first","affiliation":[]},{"given":"Kejun","family":"Ren","sequence":"additional","affiliation":[]},{"given":"Jiahao","family":"Shen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7822-4651","authenticated-orcid":false,"given":"Mingcai","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yuntao","family":"Du","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.inffus.2026.104260_bib0001","series-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0002","series-title":"Proceedings of the 35th International Conference on Neural Information Processing Systems (NeurIPS)","article-title":"Align before fuse: vision and language representation learning with momentum distillation","author":"Li","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0003","series-title":"Proceedings of the Tenth International Conference on Learning Representations (ICLR)","article-title":"FILIP: Fine-grained interactive language-Image pre-Training","author":"Yao","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0004","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"2922","article-title":"ALIP: Adaptive language-Image pre-Training with synthetic caption","author":"Yang","year":"2023"},{"key":"10.1016\/j.inffus.2026.104260_bib0005","series-title":"Proceedings of the 35th Conference on Neural Information Processing Systems (NeurIPS)","first-page":"22682","article-title":"Aligning pretraining for detection via object-Level contrastive learning","author":"Wei","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0006","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.inffus.2026.104260_bib0007","series-title":"Proceedings of the 36th International Conference on Machine Learning (ICML)","first-page":"6105","article-title":"Efficientnet: rethinking model scaling for convolutional neural networks","author":"Tan","year":"2019"},{"key":"10.1016\/j.inffus.2026.104260_bib0008","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"16795","article-title":"Conditional prompt learning for vision-language models","author":"Zhou","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0009","series-title":"International Journal of Computer Vision","first-page":"2337","article-title":"Learning to prompt for vision-Language models","author":"Zhou","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0010","series-title":"International Journal of Computer Vision","first-page":"581","article-title":"CLIP-Adapter: Better vision-Language models with feature adapters","author":"Gao","year":"2024"},{"key":"10.1016\/j.inffus.2026.104260_bib0011","series-title":"European Conference on Computer Vision","first-page":"493","article-title":"Tip-adapter: training-free adaption of clip for few-shot classification","author":"Zhang","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0012","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"19113","article-title":"Maple: multi-modal prompt learning","author":"Khattak","year":"2023"},{"key":"10.1016\/j.inffus.2026.104260_bib0013","series-title":"Proceedings of the 8th International Conference on Learning Representations (ICLR)","article-title":"Deep batch active learning by diverse, uncertain gradient lower bounds","author":"Ash","year":"2020"},{"key":"10.1016\/j.inffus.2026.104260_bib0014","series-title":"Proceedings of the 2008 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops","first-page":"1","article-title":"Entropy-based active learning for object recognition","author":"Holub","year":"2008"},{"key":"10.1016\/j.inffus.2026.104260_bib0015","series-title":"Proceedings of the 6th International Conference on Learning Representations (ICLR)","article-title":"Active learning for convolutional neural networks: a core-Set approach","author":"Sener","year":"2018"},{"key":"10.1016\/j.inffus.2026.104260_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"27004","article-title":"Active prompt learning in vision language models","author":"Bang","year":"2024"},{"key":"10.1016\/j.inffus.2026.104260_bib0017","series-title":"Proceedings of the 2025 IEEE Winter Conference on Applications of Computer Vision (WACV)","first-page":"4902","article-title":"Active learning for vision-Language models","author":"Safaei","year":"2025"},{"key":"10.1016\/j.inffus.2026.104260_bib0018","series-title":"Computer Vision - ECCV 2024","first-page":"147","article-title":"Robust calibration of large vision-Language adapters","author":"Murugesan","year":"2025"},{"key":"10.1016\/j.inffus.2026.104260_bib0019","series-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","first-page":"12697","article-title":"Calibrate before use: improving few-shot performance of language models","author":"Zhao","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0020","series-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","first-page":"4904","article-title":"Scaling up visual and vision-Language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0021","series-title":"Proceedings of the 9th International Conference on Learning Representations (ICLR)","article-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0022","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"15638","article-title":"Flava: a foundational language and vision alignment model","author":"Singh","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0023","series-title":"Proceedings of the 39th International Conference on Machine Learning (ICML)","first-page":"12888","article-title":"BLIP: Bootstrapping language-Image pre-training for unified vision-Language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0024","series-title":"Proceedings of the 40th International Conference on Machine Learning (ICML)","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-Image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.inffus.2026.104260_bib0025","series-title":"Proceedings of the 17th European Conference on Computer Vision (ECCV)","first-page":"529","article-title":"SLIP: Self-supervision meets language-Image pre-training","author":"Mu","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0026","unstructured":"L. Yuan, D. Chen, Y.-L. Chen, Florence: A New Foundation Model for Computer Vision, arXiv: 2111.11432, 2021."},{"key":"10.1016\/j.inffus.2026.104260_bib0027","series-title":"Proceedings of the 2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"6816","article-title":"Vivit: a video vision transformer","author":"Arnab","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0028","unstructured":"H. Touvron, L. Martin, K. Stone, Llama 2: Open Foundation and Fine-Tuned Chat Models, arXiv: 2307.09288, 2023."},{"key":"10.1016\/j.inffus.2026.104260_bib0029","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"709","article-title":"Visual prompt tuning","author":"Jia","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0030","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"5196","article-title":"Prompt distribution learning","author":"Lu","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0031","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"15144","article-title":"Self-regulating prompts: foundational model adaptation without forgetting","author":"Khattak","year":"2023"},{"key":"10.1016\/j.inffus.2026.104260_bib0032","series-title":"Technical Report","article-title":"Active learning literature survey","author":"Settles","year":"2009"},{"key":"10.1016\/j.inffus.2026.104260_bib0033","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing","first-page":"9275","article-title":"Dataset cartography: mapping and diagnosing datasets with training dynamics","author":"Swayamdipta","year":"2020"},{"key":"10.1016\/j.inffus.2026.104260_bib0034","series-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL-IJCNLP)","first-page":"7265","article-title":"Mind your outliers! investigating the negative impact of outliers on active learning for visual question answering","author":"Karamcheti","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0035","unstructured":"S. Zhao, L. Chen, L. Yuan, Active Learning for Large Language Models: A Survey, arXiv: 2308.03823, 2023."},{"key":"10.1016\/j.inffus.2026.104260_bib0036","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"17748","article-title":"Efficient active learning for unsupervised domain adaptation","author":"Yang","year":"2022"},{"key":"10.1016\/j.inffus.2026.104260_bib0037","series-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.inffus.2026.104260_bib0038","doi-asserted-by":"crossref","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","article-title":"Eurosat: a novel dataset and deep learning benchmark for land use and land cover classification","author":"Helber","year":"2019","journal-title":"IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens."},{"key":"10.1016\/j.inffus.2026.104260_bib0039","series-title":"Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3498","article-title":"Cats and dogs","author":"Vedaldi","year":"2012"},{"key":"10.1016\/j.inffus.2026.104260_bib0040","series-title":"Proceedings of the 2014 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3606","article-title":"Describing textures in the wild","author":"Cimpoi","year":"2014"},{"key":"10.1016\/j.inffus.2026.104260_bib0041","series-title":"Proceedings of the 2004 Conference on Computer Vision and Pattern Recognition Workshop","first-page":"178","article-title":"Learning generative visual models from few training examples: an incremental bayesian approach tested on 101 object categories","author":"Fei-Fei","year":"2004"},{"key":"10.1016\/j.inffus.2026.104260_bib0042","series-title":"Proceedings of the Sixth Indian Conference on Computer Vision, Graphics and Image Processing","first-page":"722","article-title":"Automated flower classification over a large number of classes","author":"Nilsback","year":"2008"},{"key":"10.1016\/j.inffus.2026.104260_bib0043","series-title":"Proceedings of the 2013 IEEE International Conference on Computer Vision Workshops (ICCVW)","first-page":"554","article-title":"3D Object representations for fine-Grained categorization","author":"Krause","year":"2013"},{"key":"10.1016\/j.inffus.2026.104260_bib0044","series-title":"Technical Report","article-title":"Fine-Grained Visual Classification of Aircraft","author":"Maji","year":"2013"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253526001399?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253526001399?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T15:55:15Z","timestamp":1776095715000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1566253526001399"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":44,"alternative-id":["S1566253526001399"],"URL":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104260","relation":{},"ISSN":["1566-2535"],"issn-type":[{"value":"1566-2535","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Uncertainty and diversity based selection for active learning in vision-language models","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104260","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104260"}}