{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T16:09:37Z","timestamp":1781280577023,"version":"3.54.1"},"reference-count":49,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.inffus.2026.104478","type":"journal-article","created":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T12:10:14Z","timestamp":1778760614000},"page":"104478","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["CoSE: connectivity-oriented semantic enhancement for mitigating hallucinations in multimodal LLMs"],"prefix":"10.1016","volume":"135","author":[{"given":"Yuanze","family":"Hu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhaoxin","family":"Fan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gen","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhichao","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xinyu","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ye","family":"Qiu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenjun","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kejian","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yifan","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaotie","family":"Deng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jin","family":"Dong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ziyu","family":"Jia","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.inffus.2026.104478_bib0001","doi-asserted-by":"crossref","first-page":"49250","DOI":"10.52202\/075280-2142","article-title":"Instructblip: towards general-purpose vision-language models with instruction tuning","volume":"36","author":"Dai","year":"2023","journal-title":"Adv. Neural Infor. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104478_bib0002","unstructured":"X. Dong, P. Zhang, Y. Zang, Y. Cao, B. Wang, L. Ouyang, X. Wei, S. Zhang, H. Duan, M. Cao, W. Zhang, Y. Li, H. Yan, Y. Gao, X. Zhang, W. Li, J. Li, K. Chen, C. He, X. Zhang, Y. Qiao, D. Lin, J. Wang, InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model, 2024, https:\/\/arxiv.org\/abs\/2401.16420. 2401.16420."},{"key":"10.1016\/j.inffus.2026.104478_bib0003","first-page":"34892","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104478_bib0004","unstructured":"J. Bai, S. Bai, S. Yang, S. Wang, S. Tan, P. Wang, J. Lin, C. Zhou, J. Zhou, Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond, 2023,. https:\/\/arxiv.org\/abs\/2308.12966. 2308.12966."},{"key":"10.1016\/j.inffus.2026.104478_bib0005","unstructured":"Google DeepMind, Gemini Pro - Google DeepMind, 2025, (https:\/\/deepmind.google\/models\/gemini\/pro\/)."},{"key":"10.1016\/j.inffus.2026.104478_bib0006","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2425","article-title":"Vqa: visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.inffus.2026.104478_bib0007","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","first-page":"4895","article-title":"GQA: Training generalized multi-Query transformer models from multi-Head checkpoints","author":"Ainslie","year":"2023"},{"key":"10.1016\/j.inffus.2026.104478_bib0008","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8317","article-title":"Towards vqa models that can read","author":"Singh","year":"2019"},{"key":"10.1016\/j.inffus.2026.104478_bib0009","first-page":"2507","article-title":"Learn to explain: multimodal reasoning via thought chains for science question answering","volume":"35","author":"Lu","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104478_bib0010","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9556","article-title":"Mmmu: a massive multi-discipline multimodal understanding and reasoning benchmark for expert agi","author":"Yue","year":"2024"},{"key":"10.1016\/j.inffus.2026.104478_bib0011","unstructured":"B. Zhou, Y. Hu, X. Weng, J. Jia, J. Luo, X. Liu, J. Wu, L. Huang, TinyLLaVA: A Framework of Small-scale Large Multimodal Models, 2024, https:\/\/arxiv.org\/abs\/2402.14289. 2402.14289."},{"key":"10.1016\/j.inffus.2026.104478_bib0012","article-title":"MiniCPM-V: a GPT-4V level MLLM on your phone","author":"Yao","year":"2024","journal-title":"Corr"},{"key":"10.1016\/j.inffus.2026.104478_bib0013","series-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track","article-title":"Mme: a comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2025"},{"key":"10.1016\/j.inffus.2026.104478_bib0014","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","first-page":"292","article-title":"Evaluating object hallucination in large vision-Language models","author":"Li","year":"2023"},{"key":"10.1016\/j.inffus.2026.104478_bib0015","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14375","article-title":"Hallusionbench: an advanced diagnostic suite for entangled language hallucination and visual illusion in large vision-language models","author":"Guan","year":"2024"},{"key":"10.1016\/j.inffus.2026.104478_bib0016","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"4035","article-title":"Object hallucination in image captioning","author":"Rohrbach","year":"2018"},{"key":"10.1016\/j.inffus.2026.104478_bib0017","unstructured":"Z. Bai, P. Wang, T. Xiao, T. He, Z. Han, Z. Zhang, M.Z. Shou, Hallucination of Multimodal Large Language Models: A Survey, 2024, https:\/\/arxiv.org\/abs\/2404.18930. 2404.18930."},{"key":"10.1016\/j.inffus.2026.104478_bib0018","doi-asserted-by":"crossref","first-page":"23716","DOI":"10.52202\/068431-1723","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104478_bib0019","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.inffus.2026.104478_bib0020","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"Blip-2: bootstrapping language-image pre-trainingwith frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.inffus.2026.104478_bib0021","unstructured":"S. Bai, K. Chen, X. Liu, J. Wang, W. Ge, S. Song, K. Dang, P. Wang, S. Wang, J. Tang, H. Zhong, Y. Zhu, M. Yang, Z. Li, J. Wan, P. Wang, W. Ding, Z. Fu, Y. Xu, J. Ye, X. Zhang, T. Xie, Z. Cheng, H. Zhang, Z. Yang, H. Xu, J. Lin, Qwen2.5-VL Technical Report, 2025, https:\/\/arxiv.org\/abs\/2502.13923. 2502.13923."},{"key":"10.1016\/j.inffus.2026.104478_bib0022","unstructured":"P. Sarkar, S. Ebrahimi, A. Etemad, A. Beirami, S.O. Arik, T. Pfister, Mitigating object hallucination in MLLMs via data-augmented phrase-level alignment, in: The Thirteenth International Conference on Learning Representations,."},{"key":"10.1016\/j.inffus.2026.104478_bib0023","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13872","article-title":"Mitigating object hallucinations in large vision-language models through visual contrastive decoding","author":"Leng","year":"2024"},{"key":"10.1016\/j.inffus.2026.104478_bib0024","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"26147","article-title":"Seeing far and clearly: mitigating hallucinations in MLLMs with attention causal decoding","author":"Tang","year":"2025"},{"key":"10.1016\/j.inffus.2026.104478_bib0025","unstructured":"C. Wang, X. Chen, N. Zhang, B. Tian, H. Xu, S. Deng, H. Chen, MLLM Can see? dynamic correction decoding for hallucination mitigation, in: The Thirteenth International Conference on Learning Representations,."},{"key":"10.1016\/j.inffus.2026.104478_bib0026","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"23369","article-title":"Reveal: retrieval-augmented visual-language pre-training with multi-source multimodal knowledge memory","author":"Hu","year":"2023"},{"key":"10.1016\/j.inffus.2026.104478_bib0027","article-title":"RAVEN: In-Context learning with retrieval augmented encoder-Decoder language models","author":"Huang","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.inffus.2026.104478_bib0028","unstructured":"X. Chu, L. Qiao, X. Lin, S. Xu, Y. Yang, Y. Hu, F. Wei, X. Zhang, B. Zhang, X. Wei, C. Shen, MobileVLM : A Fast, Strong and Open Vision Language Assistant for Mobile Devices, 2023, https:\/\/arxiv.org\/abs\/2312.16886. 2312.16886."},{"key":"10.1016\/j.inffus.2026.104478_bib0029","unstructured":"A. Marafioti, O. Zohar, M. Farr\u00e9, M. Noyan, E. Bakouch, P. Cuenca, C. Zakka, L.B. Allal, A. Lozhkov, N. Tazi, V. Srivastav, J. Lochner, H. Larcher, M. Morlon, L. Tunstall, L. von Werra, T. Wolf, SmolVLM: Redefining small and efficient multimodal models, 2025, https:\/\/arxiv.org\/abs\/2504.05299. 2504.05299."},{"key":"10.1016\/j.inffus.2026.104478_bib0030","unstructured":"Y. Ren, D.J. Sutherland, Learning dynamics of LLM finetuning, in: The Thirteenth International Conference on Learning Representations,."},{"key":"10.1016\/j.inffus.2026.104478_bib0031","unstructured":"Y. Zhang, Q. Dong, Empirical Investigation of Latent Representational Dynamics in Large Language Models: A Manifold Evolution Perspective, 2025, https:\/\/arxiv.org\/abs\/2505.20340. 2505.20340."},{"key":"10.1016\/j.inffus.2026.104478_bib0032","unstructured":"A. Modell, P. Rubin-Delanchy, N. Whiteley, The Origins of Representation Manifolds in Large Language Models, 2025, https:\/\/arxiv.org\/abs\/2505.18235. 2505.18235."},{"issue":"1","key":"10.1016\/j.inffus.2026.104478_bib0033","doi-asserted-by":"crossref","first-page":"1","DOI":"10.3934\/fods.2019001","article-title":"Consistent manifold representation for topological data analysis","volume":"1","author":"Berry","year":"2019","journal-title":"Found. Data Sci."},{"key":"10.1016\/j.inffus.2026.104478_bib0034","unstructured":"X. Li, A. Sarwate, Unraveling the Localized Latents: Learning Stratified Manifold Structures in LLM Embedding Space with Sparse Mixture-of-Experts, 2025, https:\/\/arxiv.org\/abs\/2502.13577. 2502.13577."},{"key":"10.1016\/j.inffus.2026.104478_bib0035","first-page":"66856","article-title":"Back to the continuous attractor","volume":"37","author":"S\u00e1godi","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104478_bib0036","unstructured":"S. Tian, H. Liu, Y. Yang, J. Yu, Z. Miao, X. Huang, Z. Liu, Z. Yi, A Differential Manifold Perspective and Universality Analysis of Continuous Attractors in Artificial Neural Networks, 2025, https:\/\/arxiv.org\/abs\/2509.10514. 2509.10514."},{"key":"10.1016\/j.inffus.2026.104478_bib0037","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","article-title":"Quantifying attention flow in transformers","author":"Abnar","year":"2020"},{"issue":"1","key":"10.1016\/j.inffus.2026.104478_bib0038","first-page":"262","article-title":"Attentionviz: a global view of transformer attention","volume":"30","author":"Yeh","year":"2023","journal-title":"IEEE Trans. Vis. Comput. Graph."},{"key":"10.1016\/j.inffus.2026.104478_bib0039","first-page":"40699","article-title":"LLM Circuit analyses are consistent across training and scale","volume":"37","author":"Tigges","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104478_bib0040","unstructured":"A.D. Hakimi, A. Modarressi, P. Wicke, H. Sch\u00fctze, Time Course MechInterp: Analyzing the Evolution of Components and Knowledge in Large Language Models, 2025, https:\/\/arxiv.org\/abs\/2506.03434. 2506.03434."},{"key":"10.1016\/j.inffus.2026.104478_bib0041","series-title":"Findings of the Association for Computational Linguistics ACL 2024","first-page":"6944","article-title":"Logical closed loop: uncovering object hallucinations in large vision-language models","author":"Wu","year":"2024"},{"issue":"12","key":"10.1016\/j.inffus.2026.104478_bib0042","doi-asserted-by":"crossref","DOI":"10.1007\/s11432-024-4251-x","article-title":"Woodpecker: hallucination correction for multimodal large language models","volume":"67","author":"Yin","year":"2024","journal-title":"Sci. China Inf. Sci."},{"key":"10.1016\/j.inffus.2026.104478_bib0043","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1818","article-title":"Wiki-llava: hierarchical retrieval-augmented generation for multimodal llms","author":"Caffagni","year":"2024"},{"key":"10.1016\/j.inffus.2026.104478_bib0044","series-title":"Proceedings of the IEEE\/CVFConference on Computer Vision and Pattern Recognition","first-page":"1724","article-title":"Recognize anything: a strong image tagging model","author":"Zhang","year":"2024"},{"key":"10.1016\/j.inffus.2026.104478_bib0045","unstructured":"A. Jaegle, S. Borgeaud, J.-B. Alayrac, C. Doersch, C. Ionescu, D. Ding, S. Koppula, D. Zoran, A. Brock, E. Shelhamer, et al., Perceiver IO: a general architecture for structured inputs & outputs, in: International Conference on Learning Representations,."},{"key":"10.1016\/j.inffus.2026.104478_bib0046","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"11975","article-title":"Sigmoid loss for language image pre-training","author":"Zhai","year":"2023"},{"key":"10.1016\/j.inffus.2026.104478_bib0047","article-title":"The faiss library","author":"Douze","year":"2025","journal-title":"IEEE Trans. Big Data"},{"key":"10.1016\/j.inffus.2026.104478_bib0048","unstructured":"W. Yu, Z. Yang, L. Li, J. Wang, K. Lin, Z. Liu, X. Wang, L. Wang, MM-Vet: Evaluating large multimodal models for integrated capabilities, in: Forty-first International Conference on Machine Learning,."},{"key":"10.1016\/j.inffus.2026.104478_bib0049","series-title":"European Conference on Computer Vision","first-page":"370","article-title":"Sharegpt4v: improving large multi-modal models with better captions","author":"Chen","year":"2024"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S156625352600357X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S156625352600357X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T15:54:51Z","timestamp":1781279691000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S156625352600357X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":49,"alternative-id":["S156625352600357X"],"URL":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104478","relation":{},"ISSN":["1566-2535"],"issn-type":[{"value":"1566-2535","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"CoSE: connectivity-oriented semantic enhancement for mitigating hallucinations in multimodal LLMs","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104478","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104478"}}