{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T12:43:39Z","timestamp":1774010619295,"version":"3.50.1"},"reference-count":62,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62177006"],"award-info":[{"award-number":["62177006"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62077009"],"award-info":[{"award-number":["62077009"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010226","name":"Department of Education of Guangdong Province","doi-asserted-by":"publisher","award":["jx2024307"],"award-info":[{"award-number":["jx2024307"]}],"id":[{"id":"10.13039\/501100010226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002701","name":"Ministry of Education","doi-asserted-by":"publisher","award":["25YJA880099"],"award-info":[{"award-number":["25YJA880099"]}],"id":[{"id":"10.13039\/501100002701","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2025A1515010136"],"award-info":[{"award-number":["2025A1515010136"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2022A1515011541"],"award-info":[{"award-number":["2022A1515011541"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002726","name":"Beijing Normal University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002726","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.neucom.2026.132780","type":"journal-article","created":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T07:24:14Z","timestamp":1768807454000},"page":"132780","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Multi-scale steering of large vision language models via visual information intervention"],"prefix":"10.1016","volume":"673","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3832-7773","authenticated-orcid":false,"given":"Dongliang","family":"Zhao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1168-1051","authenticated-orcid":false,"given":"Bo","family":"Sun","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3017-2108","authenticated-orcid":false,"given":"Jun","family":"He","sequence":"additional","affiliation":[]},{"given":"Yinghui","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"12","key":"10.1016\/j.neucom.2026.132780_bib0005","doi-asserted-by":"crossref","DOI":"10.1093\/nsr\/nwae403","article-title":"A survey on multimodal large language models","volume":"11","author":"Yin","year":"2024","journal-title":"Natl. Sci. Rev."},{"key":"10.1016\/j.neucom.2026.132780_bib0010","series-title":"IEEE International Conference on Big Data","first-page":"2247","article-title":"Multimodal large language models: a survey","author":"Wu","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0015","series-title":"Proceedings of the 3rd International Conference on Computer, Artificial Intelligence and Control Engineering","first-page":"405","article-title":"A survey of multimodel large language models","author":"Liang","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0020","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models","volume":"vol. 202","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0025","series-title":"Findings of the Association for Computational Linguistics","first-page":"15840","article-title":"Mitigating hallucinations in large vision-language models with instruction contrastive decoding","author":"Wang","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0030","author":"Lee"},{"key":"10.1016\/j.neucom.2026.132780_bib0035","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"5289","article-title":"Weakly supervised gaussian contrastive grounding with large multimodal models for video question answering","author":"Wang","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0040","series-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","article-title":"LLaVA-Med: training a large language-and-vision assistant for biomedicine in one day","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0045","author":"Sun"},{"key":"10.1016\/j.neucom.2026.132780_bib0050","series-title":"Proceedings of the 23rd Workshop on Biomedical Natural Language Processing","first-page":"597","article-title":"MAIRA at RRG24: a specialised large multimodal model for radiology report generation","author":"Srivastav","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0055","author":"Bannur"},{"key":"10.1016\/j.neucom.2026.132780_bib0060","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume":"vol. 139","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2026.132780_bib0065","author":"Bai"},{"key":"10.1016\/j.neucom.2026.132780_bib0070","author":"Liu"},{"key":"10.1016\/j.neucom.2026.132780_bib0075","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13872","article-title":"Mitigating object hallucinations in large vision-language models through visual contrastive decoding","author":"Leng","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0080","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14303","article-title":"Multi-modal hallucination control by visual information grounding","author":"Favero","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0085","series-title":"The Twelfth International Conference on Learning Representations","article-title":"Instructive decoding: instruction-tuned large language models are self-refiner from noisy instructions","author":"Kim","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0090","series-title":"European Conference on Computer Vision","first-page":"125","article-title":"Paying more attention to image: a training-free method for alleviating hallucination in lvlms","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0095","series-title":"Forty-second International Conference on Machine Learning","article-title":"The hidden life of tokens: Reducing hallucination of large vision-language models via visual information steering","author":"Li","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0100","series-title":"Generalizing from Limited Resources in the Open World","first-page":"16","article-title":"Event-priori-based vision-language model for efficient visual understanding","author":"Qin","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0105","series-title":"The Thirteenth International Conference on Learning Representations","article-title":"MLLMs know where to look: training-free perception of small visual details with multimodal LLMs","author":"Zhang","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0110","series-title":"International Symposium on Circuits and Systems","first-page":"480","article-title":"The efficient VLSI design of BI-CUBIC convolution interpolation for digital image processing","author":"Lin","year":"2008"},{"issue":"9","key":"10.1016\/j.neucom.2026.132780_bib0115","doi-asserted-by":"crossref","first-page":"1904","DOI":"10.1109\/TPAMI.2015.2389824","article-title":"Spatial pyramid pooling in deep convolutional networks for visual recognition","volume":"37","author":"He","year":"2015","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.132780_bib0120","series-title":"Forty-first International Conference on Machine Learning","article-title":"In-context vectors: making in context learning more effective and controllable through latent space steering","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0125","series-title":"Activation Addition: Steering Language Models Without Optimization","author":"Turner","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0130","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"26286","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0135","author":"Bai"},{"key":"10.1016\/j.neucom.2026.132780_bib0140","doi-asserted-by":"crossref","DOI":"10.1073\/pnas.2305016120","article-title":"Chatgpt outperforms crowd workers for text-annotation tasks","volume":"120","author":"Gilardi","year":"2023","journal-title":"Proc. Natl. Acad. Sci. U.S.A."},{"key":"10.1016\/j.neucom.2026.132780_bib0145","author":"Touvron"},{"key":"10.1016\/j.neucom.2026.132780_bib0150","unstructured":"W. Chiang, Z. Li, Z. Lin, Y. Sheng, Z. Wu, H. Zhang, L. Zheng, S. Zhuang, Y. Zhuang, J.E. Gonzalez, I. Stoica, E.P. Xing, Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality, Mar, 2023."},{"key":"10.1016\/j.neucom.2026.132780_bib0155","author":"Chen"},{"key":"10.1016\/j.neucom.2026.132780_bib0160","series-title":"The Twelfth International Conference on Learning Representations","article-title":"MiniGPT-4: enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0165","series-title":"Advances in Neural Information Processing Systems 36","article-title":"LLaVA-Med: training a large language-and-vision assistant for biomedicine in one day","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0170","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"4035","article-title":"Object hallucination in image captioning","author":"Rohrbach","year":"2018"},{"key":"10.1016\/j.neucom.2026.132780_bib0175","series-title":"Proceedings of the 41st International Conference on Machine Learning","article-title":"How language model hallucinations can snowball","author":"Zhang","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0180","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14375","article-title":"Hallusionbench: an advanced diagnostic suite for entangled language hallucination and visual illusion in large vision-language models","author":"Guan","year":"2024"},{"issue":"12","key":"10.1016\/j.neucom.2026.132780_bib0185","doi-asserted-by":"crossref","first-page":":248:1","DOI":"10.1145\/3571730","article-title":"Survey of hallucination in natural language generation","volume":"55","author":"Ji","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.neucom.2026.132780_bib0190","series-title":"The Twelfth International Conference on Learning Representations","article-title":"Mitigating hallucination in large multi-modal models via robust instruction tuning","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0195","series-title":"The Thirteenth International Conference on Learning Representations","article-title":"PerturboLLaVA: Reducing multimodal hallucinations with perturbative visual training","author":"Chen","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0200","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10610","article-title":"Mitigating hallucinations in large vision-language models via DPO: on-policy data hold the key","author":"Yang","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0205","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"27026","article-title":"Hallucination augmented contrastive learning for multimodal large language model","author":"Jiang","year":"2024"},{"issue":"9","key":"10.1016\/j.neucom.2026.132780_bib0210","doi-asserted-by":"crossref","first-page":":258:1","DOI":"10.1145\/3742434","article-title":"Alleviating hallucination in large vision-language models with active retrieval augmentation","volume":"21","author":"Qu","year":"2025","journal-title":"ACM Trans. Multim. Comput. Commun. Appl."},{"key":"10.1016\/j.neucom.2026.132780_bib0215","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13418","article-title":"OPERA: alleviating hallucination in multi-modal large language models via over-trust penalty and retrospection-allocation","author":"Huang","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0220","series-title":"Findings of the Association for Computational Linguistics","first-page":"6944","article-title":"Logical closed loop: uncovering object hallucinations in large vision-language models","author":"Wu","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0225","series-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics","first-page":"14664","article-title":"Don\u2019t hallucinate, abstain: identifying LLM knowledge gaps via multi-LLM collaboration","author":"Feng","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0230","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","first-page":"9004","article-title":"SelfCheckGPT: Zero-resource black-box hallucination detection for generative large language models","author":"Manakul","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0235","series-title":"IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"Beyond multimodal hallucinations: enhancing LVLMS through hallucination-aware direct preference optimization","author":"Zhao","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0240","series-title":"ICLR 2024 Workshop on Reliable and Responsible Foundation Models","article-title":"Seeing is believing: mitigating hallucination in large vision-language models via CLIP-guided decoding","author":"Deng","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0245","series-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics","first-page":"20619","article-title":"Multi-attribute steering of language models via targeted intervention","author":"Nguyen","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0250","series-title":"Advances in Neural Information Processing Systems","first-page":"34892","article-title":"Visual instruction tuning","volume":"vol. 36","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0255","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"936","article-title":"Feature pyramid networks for object detection","author":"Lin","year":"2017"},{"key":"10.1016\/j.neucom.2026.132780_bib0260","series-title":"IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6399","article-title":"Panoptic feature pyramid networks","author":"Kirillov","year":"2019"},{"key":"10.1016\/j.neucom.2026.132780_bib0265","series-title":"IEEE Conference on Computer Vision and Pattern Recognition","first-page":"779","article-title":"You only look once: unified, real-time object detection","author":"Redmon","year":"2016"},{"key":"10.1016\/j.neucom.2026.132780_bib0270","first-page":"1","article-title":"DinoV2: learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"10.1016\/j.neucom.2026.132780_bib0275","series-title":"Advances in Neural Information Processing Systems 35","article-title":"LAION-5B: an open large-scale dataset for training next generation image-text models","author":"Schuhmann","year":"2022"},{"key":"10.1016\/j.neucom.2026.132780_bib0280","series-title":"The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024","article-title":"DOLA: Decoding by contrasting layers improves factuality in large language models","author":"Chuang","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0285","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"25004","article-title":"Devils in middle layers of large vision-language models: interpreting, detecting and mitigating object hallucinations via attention lens","author":"Jiang","year":"2025"},{"key":"10.1016\/j.neucom.2026.132780_bib0290","series-title":"Advances in Neural Information Processing Systems 36","article-title":"InstructBLIP: towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0295","series-title":"European Conference on Computer Vision","first-page":"740","article-title":"Microsoft COCO: common objects in context","volume":"vol. 8693","author":"Lin","year":"2014"},{"key":"10.1016\/j.neucom.2026.132780_bib0300","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","first-page":"292","article-title":"Evaluating object hallucination in large vision-language models","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.132780_bib0305","series-title":"Findings of the Association for Computational Linguistics","first-page":"13088","article-title":"Aligning large multimodal models with factually augmented RLHF","author":"Sun","year":"2024"},{"key":"10.1016\/j.neucom.2026.132780_bib0310","series-title":"Perspectives in Electronic Structure Theory","first-page":"371","article-title":"Elements of information theory","author":"Nalewajski","year":"2011"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226001773?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226001773?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T10:39:41Z","timestamp":1774003181000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226001773"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":62,"alternative-id":["S0925231226001773"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.132780","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multi-scale steering of large vision language models via visual information intervention","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.132780","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"132780"}}