{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:07:56Z","timestamp":1780931276816,"version":"3.54.1"},"reference-count":54,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.114023","type":"journal-article","created":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T15:07:15Z","timestamp":1779548835000},"page":"114023","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Challenging and enhancing the reasoning capacity of multimodal LLMs in context-violating images"],"prefix":"10.1016","volume":"180","author":[{"given":"Yuyang","family":"Chen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hongxi","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qiyuan","family":"Cheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2056-6947","authenticated-orcid":false,"given":"Xinxiao","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.114023_b1","doi-asserted-by":"crossref","unstructured":"T. Thrush, R. Jiang, M. Bartolo, A. Singh, A. Williams, D. Kiela, C. Ross, Winoground: probing vision and language models for visio-linguistic compositionality, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5238\u20135248.","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"10.1016\/j.patcog.2026.114023_b2","doi-asserted-by":"crossref","unstructured":"S. Tong, Z. Liu, Y. Zhai, Y. Ma, Y. LeCun, S. Xie, Eyes wide shut? exploring the visual shortcomings of multimodal llms, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 9568\u20139578.","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"10.1016\/j.patcog.2026.114023_b3","unstructured":"M. Shukor, A. Ram\u00e9, C. Dancette, M. Cord, Beyond task performance: evaluating and reducing the flaws of large multimodal models with in-context-learning, in: International Conference on Learning Representations, 2024, pp. 12363\u201312393."},{"key":"10.1016\/j.patcog.2026.114023_b4","unstructured":"Z. Huang, C. Liu, Y. Dong, H. Su, S. Zheng, T. Liu, Machine Vision Therapy: Multimodal Large Language Models Can Enhance Visual Robustness via Denoising In-Context Learning, in: Forty-First International Conference on Machine Learning, ICML, 2024, pp. 19973\u201320003."},{"key":"10.1016\/j.patcog.2026.114023_b5","unstructured":"H. Wang, Z. Huang, Z. Lin, T. Liu, NoiseGPT: Label Noise Detection and Rectification through Probability Curvature, in: Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems, NeurIPS, 2024."},{"key":"10.1016\/j.patcog.2026.114023_b6","doi-asserted-by":"crossref","unstructured":"N. Bitton-Guetta, Y. Bitton, J. Hessel, L. Schmidt, Y. Elovici, G. Stanovsky, R. Schwartz, Breaking common sense: whoops! a vision-and-language benchmark of synthetic and compositional images, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 2616\u20132627.","DOI":"10.1109\/ICCV51070.2023.00247"},{"key":"10.1016\/j.patcog.2026.114023_b7","series-title":"Findings of the Association for Computational Linguistics: EMNLP","first-page":"10185","article-title":"ROME: evaluating pre-trained vision-language models on reasoning beyond visual common sense","author":"Zhou","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b8","doi-asserted-by":"crossref","unstructured":"M.R. Taesiri, T. Feng, C.-P. Bezemer, A. Nguyen, GlitchBench: can large multimodal models detect video game glitches?, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22444\u201322455.","DOI":"10.1109\/CVPR52733.2024.02118"},{"key":"10.1016\/j.patcog.2026.114023_b9","doi-asserted-by":"crossref","unstructured":"J. Hessel, A. Marasovi\u0107, J.D. Hwang, L. Lee, J. Da, R. Zellers, R. Mankoff, Y. Choi, Do androids laugh at electric sheep? humor \u201dunderstanding\u201d benchmarks from the new yorker caption contest, in: Proceedings of the Annual Meeting of the Association for Computational Linguistics, ACL, 2023, pp. 688\u2013714.","DOI":"10.18653\/v1\/2023.acl-long.41"},{"key":"10.1016\/j.patcog.2026.114023_b10","doi-asserted-by":"crossref","unstructured":"V. Kougia, S. Fetzel, T. Kirchmair, E. \u00c7ano, S.M. Baharlou, S. Sharifzadeh, B. Roth, Memegraphs: linking memes to knowledge graphs, in: International Conference on Document Analysis and Recognition, 2023, pp. 534\u2013551.","DOI":"10.1007\/978-3-031-41676-7_31"},{"key":"10.1016\/j.patcog.2026.114023_b11","doi-asserted-by":"crossref","unstructured":"R. Cao, R.K.-W. Lee, W.-H. Chong, J. Jiang, Prompting for multimodal hateful meme classification, in: Proceedings of the Conference on Empirical Methods in Natural Language Processing, 2022, pp. 321\u2013332.","DOI":"10.18653\/v1\/2022.emnlp-main.22"},{"key":"10.1016\/j.patcog.2026.114023_b12","doi-asserted-by":"crossref","unstructured":"H. Lin, Z. Luo, W. Gao, J. Ma, B. Wang, R. Yang, Towards explainable harmful meme detection through multimodal debate between large language models, in: Proceedings of the ACM on Web Conference, 2024, pp. 2359\u20132370.","DOI":"10.1145\/3589334.3645381"},{"key":"10.1016\/j.patcog.2026.114023_b13","doi-asserted-by":"crossref","unstructured":"B. Xie, S. Zhang, Z. Zhou, B. Li, Y. Zhang, J. Hessel, J. Yang, Z. Liu, Funqa: towards surprising video comprehension, in: European Conference on Computer Vision, 2024, pp. 39\u201357.","DOI":"10.1007\/978-3-031-73232-4_3"},{"key":"10.1016\/j.patcog.2026.114023_b14","unstructured":"Y. Zeng, W. Kang, Y. Chen, H.I. Koo, K. Lee, Can MLLMs Perform Text-to-Image In-Context Learning?, in: COLM, 2024."},{"key":"10.1016\/j.patcog.2026.114023_b15","article-title":"II-bench: An image implication understanding benchmark for multimodal large language models","volume":"vol. 37","author":"Bai","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b16","doi-asserted-by":"crossref","first-page":"96","DOI":"10.1038\/s42256-024-00963-y","article-title":"Visual cognition in multimodal large language models","volume":"7","author":"Buschoff","year":"2023","journal-title":"Nat. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.114023_b17","series-title":"VideoHallu: Evaluating and mitigating multi-modal hallucinations on synthetic video understanding","author":"Li","year":"2025"},{"issue":"3","key":"10.1016\/j.patcog.2026.114023_b18","first-page":"16","article-title":"Advanced embedding techniques in multimodal retrieval augmented generation a comprehensive study on cross modal ai applications","volume":"13","author":"Zhou","year":"2024","journal-title":"J. Comput. Electron. Inf. Manag."},{"key":"10.1016\/j.patcog.2026.114023_b19","doi-asserted-by":"crossref","unstructured":"P. Joshi, A. Gupta, P. Kumar, M. Sisodia, Robust multimodel rag pipeline for documents containing text, table & images, in: Proceedings of the International Conference on Applied Artificial Intelligence and Computing, 2024, pp. 993\u2013999.","DOI":"10.1109\/ICAAIC60222.2024.10574972"},{"key":"10.1016\/j.patcog.2026.114023_b20","doi-asserted-by":"crossref","unstructured":"P. Lerner, O. Ferret, C. Guinaudeau, Cross-modal retrieval for knowledge-based visual question answering, in: Advances in the European Conference on Information Retrieval, 2024, pp. 421\u2013438.","DOI":"10.1007\/978-3-031-56027-9_26"},{"key":"10.1016\/j.patcog.2026.114023_b21","series-title":"Findings of the Association for Computational Linguistics: EMNLP","first-page":"247","article-title":"Snapntell: enhancing entity-centric visual question answering with retrieval augmented multimodal llm","author":"Qiu","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b22","series-title":"Proceedings of the Association for Computational Linguistics: EMNLP","first-page":"1081","article-title":"Rule: reliable multimodal rag for factuality in medical vision language models","author":"Xia","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b23","series-title":"Advances in Neural Information Processing Systems","first-page":"28541","article-title":"Llava-med: training a large language-and-vision assistant for biomedicine in one day","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b24","series-title":"Retrieval meets reasoning: even high-school textbook knowledge benefits multimodal reasoning","author":"Tan","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b25","doi-asserted-by":"crossref","unstructured":"D. Caffagni, F. Cocchi, N. Moratelli, S. Sarto, M. Cornia, L. Baraldi, R. Cucchiara, Wiki-LLaVA: Hierarchical Retrieval-Augmented Generation for Multimodal LLMs, in: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, CVPRW, 2024, pp. 1818\u20131826.","DOI":"10.1109\/CVPRW63382.2024.00188"},{"key":"10.1016\/j.patcog.2026.114023_b26","doi-asserted-by":"crossref","unstructured":"X. Long, Z. Ma, E. Hua, K. Zhang, B. Qi, B. Zhou, Retrieval-Augmented Visual Question Answering via Built-in Autoregressive Search Engines, in: AAAI, 2025, pp. 24723\u201324731.","DOI":"10.1609\/aaai.v39i23.34653"},{"key":"10.1016\/j.patcog.2026.114023_b27","article-title":"Improving image generation with better captions","author":"Betker","year":"2023","journal-title":"Comput. Sci."},{"key":"10.1016\/j.patcog.2026.114023_b28","doi-asserted-by":"crossref","unstructured":"O. Honovich, L. Choshen, R. Aharoni, E. Neeman, I. Szpektor, O. Abend, Q2: evaluating factual consistency in knowledge-grounded dialogues via question generation and question answering, in: Proceedings of the Association for Computational Linguistics: EMNLP, 2021, pp. 7856\u20137870.","DOI":"10.18653\/v1\/2021.emnlp-main.619"},{"key":"10.1016\/j.patcog.2026.114023_b29","doi-asserted-by":"crossref","unstructured":"S. Changpinyo, D. Kukliansky, I. Szpektor, X. Chen, N. Ding, R. Soricut, All you may need for vqa are image captions, in: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics, 2022, pp. 1947\u20131963.","DOI":"10.18653\/v1\/2022.naacl-main.142"},{"key":"10.1016\/j.patcog.2026.114023_b30","unstructured":"L.H. Li, P. Zhang, H. Zhang, J. Yang, C. Li, Y. Zhong, L. Wang, L. Yuan, L. Zhang, J.-N. Hwang, et al., Grounded language-image pre-training, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 10965\u201310975."},{"issue":"11","key":"10.1016\/j.patcog.2026.114023_b31","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1145\/219717.219748","article-title":"WordNet: a lexical database for english","volume":"38","author":"Miller","year":"1995","journal-title":"Commun. ACM"},{"key":"10.1016\/j.patcog.2026.114023_b32","doi-asserted-by":"crossref","unstructured":"J. Devlin, M. Chang, K. Lee, K. Toutanova, Bert: pre-training of deep bidirectional transformers for Language understanding, in: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics, 2019, pp. 4171\u20134186.","DOI":"10.18653\/v1\/N19-1423"},{"key":"10.1016\/j.patcog.2026.114023_b33","unstructured":"J. Li, D. Li, S. Savarese, S. Hoi, Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models, in: International Conference on Machine Learning, 2023, pp. 19730\u201319742."},{"key":"10.1016\/j.patcog.2026.114023_b34","series-title":"Advances in Neural Information Processing Systems","first-page":"49250","article-title":"Instructblip: towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b35","series-title":"Mplug-owl: modularization empowers large language models with multimodality","author":"Ye","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b36","series-title":"Mplug-owl2: revolutionizing multi-modal large language model with modality collaboration","author":"Ye","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b37","unstructured":"H. Zhao, Z. Cai, S. Si, X. Ma, K. An, L. Chen, Z. Liu, S. Wang, W. Han, B. Chang, Mmicl: empowering vision-language model with multi-modal in-context learning, in: International Conference on Learning Representations, 2024, pp. 31432\u201331470."},{"key":"10.1016\/j.patcog.2026.114023_b38","series-title":"Mimic-it: multi-modal in-context instruction tuning","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b39","series-title":"Openflamingo: an open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b40","doi-asserted-by":"crossref","unstructured":"H. Liu, C. Li, Y. Li, Y.J. Lee, Improved baselines with visual instruction tuning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 26296\u201326306.","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"10.1016\/j.patcog.2026.114023_b41","series-title":"Llava-next: improved reasoning, OCR, and world knowledge","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b42","series-title":"Deepseek-vl: towards real-world vision-language understanding","author":"Lu","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b43","series-title":"R-4b: incentivizing general-purpose auto-thinking capability in mllms via bi-mode annealing and reinforce learning","author":"Yang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114023_b44","series-title":"Qwen2.5-vl technical report","author":"Bai","year":"2025"},{"key":"10.1016\/j.patcog.2026.114023_b45","series-title":"Internvl3.5: advancing open-source multimodal models in versatility, reasoning, and efficiency","author":"Wang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114023_b46","series-title":"Ovis2.5 technical report","author":"Lu","year":"2025"},{"key":"10.1016\/j.patcog.2026.114023_b47","series-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"10.1016\/j.patcog.2026.114023_b48","series-title":"Cocot: contrastive chain-of-thought prompting for large multimodal models with multiple image inputs","author":"Zhang","year":"2024"},{"key":"10.1016\/j.patcog.2026.114023_b49","doi-asserted-by":"crossref","unstructured":"K. Papineni, S. Roukos, T. Ward, W.-J. Zhu, Bleu: a method for automatic evaluation of machine translation, in: Proceedings of the Annual Meeting of the Association for Computational Linguistics, 2002, pp. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"10.1016\/j.patcog.2026.114023_b50","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"Rouge: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.patcog.2026.114023_b51","doi-asserted-by":"crossref","unstructured":"R. Vedantam, C. Lawrence Zitnick, D. Parikh, Cider: consensus-based image description evaluation, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.patcog.2026.114023_b52","doi-asserted-by":"crossref","unstructured":"Z. Wu, M. Palmer, Verb semantics and lexical selection, in: Proceedings of the Annual Meeting of the Association for Computational Linguistics, 1994, pp. 133\u2013138.","DOI":"10.3115\/981732.981751"},{"key":"10.1016\/j.patcog.2026.114023_b53","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1023\/B:BTTJ.0000047600.45421.6d","article-title":"ConceptNet \u2014 A practical commonsense reasoning tool-kit","volume":"22","author":"Liu","year":"2004","journal-title":"BT Technol. J."},{"key":"10.1016\/j.patcog.2026.114023_b54","doi-asserted-by":"crossref","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","article-title":"The open images dataset V4","volume":"128","author":"Kuznetsova","year":"2018","journal-title":"Int. J. Comput. Vis."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032600988X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032600988X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T14:56:28Z","timestamp":1780930588000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S003132032600988X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":54,"alternative-id":["S003132032600988X"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114023","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Challenging and enhancing the reasoning capacity of multimodal LLMs in context-violating images","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114023","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114023"}}