{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,28]],"date-time":"2025-08-28T02:40:08Z","timestamp":1756348808399,"version":"3.44.0"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["32371877"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","award":["2022YFD2201005"],"id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1016\/j.eswa.2025.129373","type":"journal-article","created":{"date-parts":[[2025,8,18]],"date-time":"2025-08-18T07:43:20Z","timestamp":1755503000000},"page":"129373","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PB","title":["Instance-Aware Visual Prompting helps multimodal models see better"],"prefix":"10.1016","volume":"297","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5396-019X","authenticated-orcid":false,"given":"Xiaoyu","family":"Lin","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5551-9128","authenticated-orcid":false,"given":"Jingxu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Liyong","family":"Fu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6135-3809","authenticated-orcid":false,"given":"He","family":"Yan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8793-8610","authenticated-orcid":false,"given":"Qiaolin","family":"Ye","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2025.129373_bib0001","doi-asserted-by":"crossref","DOI":"10.1016\/j.eti.2023.103050","article-title":"Circular bioeconomy in palm oil industry: Current practices and future perspectives","volume":"30","author":"Cheah","year":"2023","journal-title":"Environmental Technology & Innovation"},{"key":"10.1016\/j.eswa.2025.129373_bib0002","unstructured":"Chen, J., Zhu, D., Shen, X., Li, X., Liu, Z., Zhang, P., Krishnamoorthi, R., Chandra, V., Xiong, Y., & Elhoseiny, M. (2023). Minigpt-v2: Large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478."},{"key":"10.1016\/j.eswa.2025.129373_bib0003","unstructured":"DeepSeek-AI (2024). Deepseek LLM: Scaling open-source language models with longtermism. arXiv:2401.02954. https:\/\/github.com\/deepseek-ai\/deepseek-llm."},{"key":"10.1016\/j.eswa.2025.129373_bib0004","series-title":"2024 5th International conference on information science, parallel and distributed systems (ISPDS)","first-page":"77","article-title":"Enhance image-to-image generation with llava-generated prompts","author":"Ding","year":"2024"},{"key":"10.1016\/j.eswa.2025.129373_bib0005","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1016\/j.jmsy.2024.06.007","article-title":"Enhancing metal additive manufacturing training with the advanced vision language model: A pathway to immersive augmented reality training for non-experts","volume":"75","author":"Fan","year":"2024","journal-title":"Journal of Manufacturing Systems"},{"key":"10.1016\/j.eswa.2025.129373_bib0006","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5356","article-title":"Lvis: A dataset for large vocabulary instance segmentation","author":"Gupta","year":"2019"},{"issue":"1","key":"10.1016\/j.eswa.2025.129373_bib0007","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1038\/s43586-021-00018-1","article-title":"Clip and complementary methods","volume":"1","author":"Hafner","year":"2021","journal-title":"Nature Reviews Methods Primers"},{"key":"10.1016\/j.eswa.2025.129373_bib0008","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102270","article-title":"From image to language: A critical analysis of visual question answering (vqa) approaches, challenges, and opportunities","volume":"106","author":"Ishmam","year":"2024","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2025.129373_bib0009","series-title":"European conference on computer vision","first-page":"709","article-title":"Visual prompt tuning","author":"Jia","year":"2022"},{"issue":"3","key":"10.1016\/j.eswa.2025.129373_bib0010","article-title":"Study and analysis of chat GPT and its impact on different fields of study","volume":"8","author":"Kalla","year":"2023","journal-title":"International Journal of Innovative Science and Research Technology"},{"issue":"2270","key":"10.1016\/j.eswa.2025.129373_bib0011","article-title":"Gpt-4 passes the bar exam","volume":"382","author":"Katz","year":"2024","journal-title":"Philosophical Transactions of the Royal Society A"},{"issue":"9","key":"10.1016\/j.eswa.2025.129373_bib0012","doi-asserted-by":"crossref","first-page":"5587","DOI":"10.1080\/10494820.2023.2220401","article-title":"Unlocking the potential of GPT-3 in education: Opportunities, limitations, and recommendations for effective integration","volume":"32","author":"Kikalishvili","year":"2024","journal-title":"Interactive Learning Environments"},{"key":"10.1016\/j.eswa.2025.129373_bib0013","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"4015","article-title":"Segment anything","author":"Kirillov","year":"2023"},{"key":"10.1016\/j.eswa.2025.129373_bib0014","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"3041","article-title":"Mask dino: Towards a unified transformer-based framework for object detection and segmentation","author":"Li","year":"2023"},{"key":"10.1016\/j.eswa.2025.129373_bib0015","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"2","key":"10.1016\/j.eswa.2025.129373_bib0016","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1109\/MGRS.2024.3383473","article-title":"Vision-language models in remote sensing: Current progress and future trends","volume":"12","author":"Li","year":"2024","journal-title":"IEEE Geoscience and Remote Sensing Magazine"},{"key":"10.1016\/j.eswa.2025.129373_bib0017","doi-asserted-by":"crossref","first-page":"10952","DOI":"10.1109\/TMM.2024.3428317","article-title":"Lmeye: An interactive perception network for large language models","volume":"26","author":"Li","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"issue":"1","key":"10.1016\/j.eswa.2025.129373_bib0018","first-page":"33","article-title":"Deepseek large-scale model: Technical analysis and development prospect","volume":"7","author":"Liao","year":"2025","journal-title":"Journal of Computer Science and Electrical Engineering"},{"key":"10.1016\/j.eswa.2025.129373_bib0019","series-title":"Computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6-12, 2014, proceedings, part v 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.eswa.2025.129373_bib0020","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"26296","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2024"},{"key":"10.1016\/j.eswa.2025.129373_bib0021","series-title":"International conference on database systems for advanced applications","first-page":"419","article-title":"Flickr30k-CFQ: A compact and fragmented query dataset for text-image retrieval","author":"Liu","year":"2024"},{"issue":"9","key":"10.1016\/j.eswa.2025.129373_bib0022","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3560815","article-title":"Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing","volume":"55","author":"Liu","year":"2023","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.eswa.2025.129373_bib0023","series-title":"European conference on computer vision","first-page":"38","article-title":"Grounding dino: Marrying dino with grounded pre-training for open-set object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.eswa.2025.129373_bib0024","series-title":"International conference on machine learning","first-page":"22631","article-title":"The flan collection: Designing data and methods for effective instruction tuning","author":"Longpre","year":"2023"},{"key":"10.1016\/j.eswa.2025.129373_bib0025","unstructured":"Qi, J., Ding, M., Wang, W., Bai, Y., Lv, Q., Hong, W., Xu, B., Hou, L., Li, J., Dong, Y. et al. (2024). Cogcom: Train large vision-language models diving into details through chain of manipulations."},{"key":"10.1016\/j.eswa.2025.129373_bib0026","series-title":"ICLR","article-title":"Learning clustering-based prototypes for compositional zero-shot learning","author":"Qu","year":"2025"},{"key":"10.1016\/j.eswa.2025.129373_bib0027","unstructured":"Ren, T., Liu, S., Zeng, A., Lin, J., Li, K., Cao, H., Chen, J., Huang, X., Chen, Y., Yan, F. et al. (2024). Grounded sam: Assembling open-world models for diverse visual tasks. arXiv:2401.14159."},{"key":"10.1016\/j.eswa.2025.129373_bib0028","unstructured":"Salehi, M., Farajtabar, M., Horton, M., Faghri, F., Pouransari, H., Vemulapalli, R., Tuzel, O., Farhadi, A., Rastegari, M., & Mehta, S. (2023). Clip meets model zoo experts: Pseudo-supervision for visual enhancement. arXiv:2310.14108."},{"key":"10.1016\/j.eswa.2025.129373_bib0029","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"11987","article-title":"What does clip know about a red circle? Visual prompt engineering for vlms","author":"Shtedritski","year":"2023"},{"key":"10.1016\/j.eswa.2025.129373_bib0030","series-title":"Proceedings of the 18th international workshop on semantic evaluation (semeval-2024)","first-page":"40","article-title":"T5-medical at semeval-2024 task 2: Using t5 medical embedding for natural language inference on clinical trial data","author":"Siino","year":"2024"},{"issue":"1","key":"10.1016\/j.eswa.2025.129373_bib0031","first-page":"1146","article-title":"Interactive and visual prompt engineering for ad-hoc task adaptation with large language models","volume":"29","author":"Strobelt","year":"2022","journal-title":"IEEE transactions on visualization and computer graphics"},{"key":"10.1016\/j.eswa.2025.129373_bib0032","doi-asserted-by":"crossref","DOI":"10.1016\/j.jvcir.2023.103830","article-title":"Rethinking PASCAL-VOC and MS-COCO dataset for small object detection","volume":"93","author":"Tong","year":"2023","journal-title":"Journal of Visual Communication and Image Representation"},{"key":"10.1016\/j.eswa.2025.129373_bib0033","series-title":"2024 International conference on advances in data engineering and intelligent computing systems (ADICS)","first-page":"1","article-title":"Yolov8: A novel object detection algorithm with enhanced performance and robustness","author":"Varghese","year":"2024"},{"issue":"2","key":"10.1016\/j.eswa.2025.129373_bib0034","doi-asserted-by":"crossref","first-page":"982","DOI":"10.1109\/TMI.2024.3473745","article-title":"Attriprompter: Auto-prompting with attribute semantics for zero-shot nuclei detection via visual-language pre-trained models","volume":"44","author":"Wu","year":"2024","journal-title":"IEEE Transactions on Medical Imaging"},{"key":"10.1016\/j.eswa.2025.129373_bib0035","unstructured":"Yang, J., Zhang, H., Li, F., Zou, X., Li, C., & Gao, J. (2023a). Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4v. https:\/\/arxiv.org\/abs\/2310.11441."},{"key":"10.1016\/j.eswa.2025.129373_bib0036","unstructured":"Yang, J., Zhang, H., Li, F., Zou, X., Li, C., & Gao, J. (2023b). Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv:2310.11441."},{"key":"10.1016\/j.eswa.2025.129373_bib0037","first-page":"24993","article-title":"Fine-grained visual prompting","volume":"36","author":"Yang","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2025.129373_bib0038","unstructured":"Yang, Z., Li, L., Lin, K., Wang, J., Lin, C.-C., Liu, Z., & Wang, L. (2023d). The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv:2309.17421, 9(1), 1."},{"issue":"2","key":"10.1016\/j.eswa.2025.129373_bib0039","article-title":"The general intelligence of GPT\u20134, its knowledge diffusive and societal influences, and its governance","volume":"2","author":"Yekta","year":"2024","journal-title":"Meta-Radiology"},{"key":"10.1016\/j.eswa.2025.129373_bib0040","doi-asserted-by":"crossref","first-page":"495","DOI":"10.1109\/TIP.2024.3523801","article-title":"3VL: Using trees to improve vision-language models\u2019 interpretability","volume":"34","author":"Yellinek","year":"2025","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.eswa.2025.129373_bib0041","series-title":"European conference on computer vision","first-page":"106","article-title":"Open-vocabulary detr with conditional matching","author":"Zang","year":"2022"},{"key":"10.1016\/j.eswa.2025.129373_bib0042","unstructured":"Zhan, Y., Zhu, Y., Zhao, H., Yang, F., Tang, M., & Wang, J. (2024). Griffon v2: Advancing multimodal perception with high-resolution scaling and visual-language co-referring. arXiv:2403.09333."},{"key":"10.1016\/j.eswa.2025.129373_bib0043","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"636","article-title":"Can vision-language models be a good guesser? exploring vlms for times and location reasoning","author":"Zhang","year":"2024"},{"key":"10.1016\/j.eswa.2025.129373_bib0044","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"1020","article-title":"A simple framework for open-vocabulary segmentation and detection","author":"Zhang","year":"2023"},{"key":"10.1016\/j.eswa.2025.129373_bib0045","unstructured":"Zhang, H., You, H., Dufter, P., Zhang, B., Chen, C., Chen, H.-Y., Fu, T.-J., Wang, W.Y., Chang, S.F., Gan, Z. et al. (2024b). Ferret-v2: An improved baseline for referring and grounding with large language models. arXiv:2404.07973."},{"key":"10.1016\/j.eswa.2025.129373_bib0046","doi-asserted-by":"crossref","unstructured":"Zhang, S., Sun, P., Chen, S., Xiao, M., Shao, W., Zhang, W., Liu, Y., Chen, K., & Luo, P. (2024c). Gpt4roi: Instruction tuning large language model on region-of-interest. https:\/\/arxiv.org\/abs\/2307.03601.","DOI":"10.1007\/978-3-031-91813-1_4"},{"key":"10.1016\/j.eswa.2025.129373_bib0047","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.112974","article-title":"Consistent prompt learning for vision-language models","volume":"310","author":"Zhang","year":"2025","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.eswa.2025.129373_bib0048","series-title":"2017 IEEE Conference on computer vision and pattern recognition","first-page":"5122","article-title":"Scene parsing through ADE20k dataset","author":"Zhou","year":"2017"},{"key":"10.1016\/j.eswa.2025.129373_bib0049","doi-asserted-by":"crossref","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","article-title":"Semantic understanding of scenes through the ade20k dataset","volume":"127","author":"Zhou","year":"2019","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.eswa.2025.129373_bib0050","first-page":"1","article-title":"A comprehensive survey on pretrained foundation models: A history from bert to chatgpt","author":"Zhou","year":"2024","journal-title":"International Journal of Machine Learning and Cybernetics"},{"issue":"9","key":"10.1016\/j.eswa.2025.129373_bib0051","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"International Journal of Computer Vision"},{"issue":"12","key":"10.1016\/j.eswa.2025.129373_bib0052","doi-asserted-by":"crossref","first-page":"8954","DOI":"10.1109\/TPAMI.2024.3413013","article-title":"A survey on open-vocabulary detection and segmentation: Past, present, and future","volume":"46","author":"Zhu","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2025.129373_bib0053","first-page":"19769","article-title":"Segment everything everywhere all at once","volume":"36","author":"Zou","year":"2024","journal-title":"Advances in Neural Information Processing Systems"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417425029884?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417425029884?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,8,28]],"date-time":"2025-08-28T02:09:07Z","timestamp":1756346947000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417425029884"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":53,"alternative-id":["S0957417425029884"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2025.129373","relation":{},"ISSN":["0957-4174"],"issn-type":[{"type":"print","value":"0957-4174"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Instance-Aware Visual Prompting helps multimodal models see better","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2025.129373","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"129373"}}