{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:58:08Z","timestamp":1780934288831,"version":"3.54.1"},"reference-count":54,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.113955","type":"journal-article","created":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T02:18:11Z","timestamp":1778811491000},"page":"113955","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Reinforcement learning-powered co-optimization: Bridging critic model and multimodal LLM reasoning abilities"],"prefix":"10.1016","volume":"180","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3843-3920","authenticated-orcid":false,"given":"Qing","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuhang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhenrong","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2387-0389","authenticated-orcid":false,"given":"Jun","family":"Du","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianshu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Quan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113955_b1","unstructured":"P. Lu, H. Bansal, T. Xia, J. Liu, C. Li, H. Hajishirzi, H. Cheng, K.-W. Chang, M. Galley, J. Gao, MathVista: Evaluating mathematical reasoning of foundation models in visual contexts, in: Proceedings of the International Conference on Learning Representations, 2024."},{"key":"10.1016\/j.patcog.2026.113955_b2","article-title":"Visual reasoning consistency and robustness analysis of multimodal LLMs","author":"Jegham","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113955_b3","series-title":"European Conference on Computer Vision","first-page":"169","article-title":"MathVerse: Does your multi-modal llm truly see the diagrams in visual math problems?","author":"Zhang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b4","article-title":"A multi-expert framework for enhancing multimodal large language models in industrial anomaly detection","author":"Chen","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113955_b5","series-title":"OpenAI o1 system card","author":"Jaech","year":"2024"},{"issue":"8081","key":"10.1016\/j.patcog.2026.113955_b6","doi-asserted-by":"crossref","first-page":"633","DOI":"10.1038\/s41586-025-09422-z","article-title":"DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning","volume":"645","author":"Guo","year":"2025","journal-title":"Nature"},{"key":"10.1016\/j.patcog.2026.113955_b7","first-page":"8634","article-title":"Reflexion: Language agents with verbal reinforcement learning","volume":"vol. 36","author":"Shinn","year":"2023"},{"key":"10.1016\/j.patcog.2026.113955_b8","doi-asserted-by":"crossref","unstructured":"P. Jian, J. Wu, W. Sun, C. Wang, S. Ren, J. Zhang, Look again, think slowly: Enhancing visual reflection in vision-language models, in: Proceedings of the Conference on Empirical Methods in Natural Language Processing, 2025, pp. 9262\u20139281.","DOI":"10.18653\/v1\/2025.emnlp-main.470"},{"key":"10.1016\/j.patcog.2026.113955_b9","unstructured":"S. Welleck, X. Lu, P. West, F. Brahman, T. Shen, D. Khashabi, Y. Choi, Generating sequences by learning to self-correct, in: Proceedings of the International Conference on Learning Representations, 2023."},{"key":"10.1016\/j.patcog.2026.113955_b10","article-title":"A reinforcement learning framework for energy-optimal UAV path planning in wind fields","author":"Lian","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113955_b11","unstructured":"W. Huang, B. Jia, Z. Zhai, S. Cao, Z. Ye, F. Zhao, Z. Xu, Y. Hu, S. Lin, Vision-R1: Incentivizing reasoning capability in multimodal large language models, in: Proceedings of the International Conference on Learning Representations, 2026."},{"key":"10.1016\/j.patcog.2026.113955_b12","doi-asserted-by":"crossref","unstructured":"Y. Yang, X. He, H. Pan, X. Jiang, Y. Deng, X. Yang, H. Lu, D. Yin, F. Rao, M. Zhu, et al., R1-Onevision: Advancing generalized multimodal reasoning through cross-modal formalization, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 2376\u20132385.","DOI":"10.1109\/ICCV51701.2025.00229"},{"key":"10.1016\/j.patcog.2026.113955_b13","doi-asserted-by":"crossref","unstructured":"J. Zhang, X. Wang, F. Mo, Y. Zhou, W. Gao, K. Liu, Entropy-based exploration conduction for multi-step reasoning, in: Findings of the Association for Computational Linguistics, 2025, pp. 3895\u20133906.","DOI":"10.18653\/v1\/2025.findings-acl.201"},{"key":"10.1016\/j.patcog.2026.113955_b14","series-title":"Enhancing LLM reasoning via critique models with test-time and training-time supervision","author":"Xi","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b15","series-title":"LLM critics help catch LLM bugs","author":"McAleese","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b16","doi-asserted-by":"crossref","unstructured":"A.F. Aky\u00fcrek, E. Aky\u00fcrek, A. Kalyan, P. Clark, D.T. Wijaya, N. Tandon, RL4F: Generating natural language feedback with reinforcement learning for repairing model outputs, in: Proceedings of the Annual Meeting of the Association for Computational Linguistics, 2023, pp. 7716\u20137733.","DOI":"10.18653\/v1\/2023.acl-long.427"},{"key":"10.1016\/j.patcog.2026.113955_b17","doi-asserted-by":"crossref","unstructured":"D. Zhang, J. Lei, J. Li, X. Wang, Y. Liu, Z. Yang, J. Li, W. Wang, S. Yang, J. Wu, et al., Critic-V: VLM critics help catch VLM errors in multimodal reasoning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2025, pp. 9050\u20139061.","DOI":"10.1109\/CVPR52734.2025.00846"},{"key":"10.1016\/j.patcog.2026.113955_b18","unstructured":"Y. Wang, X. Yue, W. Chen, Critique fine-tuning: Learning to critique is more effective than learning to imitate, in: Conference on Language Modeling, 2025."},{"key":"10.1016\/j.patcog.2026.113955_b19","series-title":"DeepSeekMath: Pushing the limits of mathematical reasoning in open language models","author":"Shao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b20","doi-asserted-by":"crossref","unstructured":"S. Liu, Z. Zhang, P. Hu, J. Ma, J. Du, Q. Wang, J. Zhang, Q. Liu, J. Gao, F. Ma, MMC: Iterative refinement of VLM reasoning via MCTS-based multimodal critique, in: Proceedings of the International Workshop on Large Generative Models Meet Multimodal Applications, 2025, pp. 11\u201320.","DOI":"10.1145\/3728422.3762145"},{"key":"10.1016\/j.patcog.2026.113955_b21","series-title":"Proceedings of the International Conference on Computers and Games","first-page":"72","article-title":"Efficient selectivity and backup operators in Monte-Carlo tree search","author":"Coulom","year":"2006"},{"key":"10.1016\/j.patcog.2026.113955_b22","series-title":"Inference-time scaling for generalist reward modeling","author":"Liu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113955_b23","unstructured":"Z. Ankner, M. Paul, B. Cui, J.D. Chang, P. Ammanabrolu, Critique-out-loud reward models, in: Advances in Neural Information Processing Systems, 2024."},{"key":"10.1016\/j.patcog.2026.113955_b24","unstructured":"H. Lightman, V. Kosaraju, Y. Burda, H. Edwards, B. Baker, T. Lee, J. Leike, J. Schulman, I. Sutskever, K. Cobbe, Let\u2019s verify step by step, in: Proceedings of the International Conference on Learning Representations, 2024."},{"key":"10.1016\/j.patcog.2026.113955_b25","series-title":"Atomthink: A slow thinking framework for multimodal mathematical reasoning","author":"Xiang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b26","doi-asserted-by":"crossref","unstructured":"G. Xu, P. Jin, Z. Wu, H. Li, Y. Song, L. Sun, L. Yuan, LLaVA-CoT: Let vision language models reason step-by-step, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 2087\u20132098.","DOI":"10.1109\/ICCV51701.2025.00202"},{"key":"10.1016\/j.patcog.2026.113955_b27","unstructured":"R. Luo, Z. Zheng, L. Wang, Y. Wang, X. Ni, Z. Lin, S. Jiang, Y. Yu, C. Shi, R. Chu, et al., Unlocking Multimodal Mathematical Reasoning via Process Reward Model, in: Proceedings of the Conference on Neural Information Processing Systems, 2025."},{"key":"10.1016\/j.patcog.2026.113955_b28","unstructured":"Q. Cao, R. Wang, R. Zhang, S.A. Somayajula, P. Xie, DreamPRM: Domain-Reweighted Process Reward Model for Multimodal Reasoning, in: Proceedings of the Conference on Neural Information Processing Systems, 2025."},{"key":"10.1016\/j.patcog.2026.113955_b29","series-title":"Agent-R: Training language model agents to reflect via iterative self-training","author":"Yuan","year":"2025"},{"key":"10.1016\/j.patcog.2026.113955_b30","article-title":"Reasoning elicitation and multi-granularity contrastive learning for text-rich image understanding in large vision-language models","author":"Xia","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113955_b31","series-title":"LMM-R1: Empowering 3B LLMs with strong reasoning abilities through two-stage rule-based RL","author":"Peng","year":"2025"},{"key":"10.1016\/j.patcog.2026.113955_b32","series-title":"Advances in Neural Information Processing Systems","article-title":"Reason-RFT: Reinforcement fine-tuning for visual reasoning of vision language models","author":"Tan","year":"2025"},{"key":"10.1016\/j.patcog.2026.113955_b33","series-title":"Advances in Neural Information Processing Systems","article-title":"VL-Rethinker: Incentivizing self-reflection of vision-language models with reinforcement learning","author":"Wang","year":"2025"},{"key":"10.1016\/j.patcog.2026.113955_b34","doi-asserted-by":"crossref","unstructured":"F. Wang, Z. Zhao, Y. Liu, D. Zhang, J. Gao, H. Sun, X. Li, SVGen: Interpretable vector graphics generation with large language models, in: Proceedings of the ACM International Conference on Multimedia, 2025, pp. 9608\u20139617.","DOI":"10.1145\/3746027.3755011"},{"key":"10.1016\/j.patcog.2026.113955_b35","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"vol. 36","author":"Rafailov","year":"2023"},{"key":"10.1016\/j.patcog.2026.113955_b36","article-title":"Parameter-efficient action planning with large language models for vision-and-language navigation","author":"Mohammadi","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113955_b37","doi-asserted-by":"crossref","unstructured":"D. Zhang, F. Wang, B. Li, Z. Zhao, J. Gao, X. Li, KAID: Knowledge-Aware Interactive Distillation for Vision-Language Models, in: Proceedings of the ACM International Conference on Multimedia, 2025, pp. 3212\u20133221.","DOI":"10.1145\/3746027.3755008"},{"key":"10.1016\/j.patcog.2026.113955_b38","series-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b39","series-title":"Claude 3.5 sonnet model card addendum","author":"Anthropic","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b40","series-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b41","series-title":"MM-Eureka: Exploring the frontiers of multimodal reasoning with rule-based reinforcement learning","author":"Meng","year":"2025"},{"key":"10.1016\/j.patcog.2026.113955_b42","unstructured":"Y. Zhan, Z. Wu, Y. Zhu, R. Xue, R. Luo, Z. Chen, C. Zhang, Y. Li, Z. He, Z. Yang, et al., GThinker: Towards General Multimodal Reasoning via Cue-Guided Rethinking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2026."},{"key":"10.1016\/j.patcog.2026.113955_b43","doi-asserted-by":"crossref","unstructured":"Y. Zha, K. Zhou, Y. Wu, Y. Wang, J. Feng, Z. Xu, S. Hao, Z. Liu, E.P. Xing, Z. Hu, Vision-G1: Towards General Reasoning Vision-Language Models via Reinforcement Learning, in: AAAI Conference on Artificial Intelligence, 2026, p. 28131\u201328139.","DOI":"10.1609\/aaai.v40i33.40039"},{"key":"10.1016\/j.patcog.2026.113955_b44","unstructured":"L. Zhu, Y. Guan, D. Liang, J. Ju, Z. Luo, B. Qin, J. Luan, Y. Liu, X. Bai, Shuffle-R1: Efficient RL framework for multimodal large language models via data-centric dynamic shuffle, in: Proceedings of the International Conference on Learning Representations, 2026."},{"key":"10.1016\/j.patcog.2026.113955_b45","first-page":"6155","article-title":"MathVision: An accessible intelligent agent for visually impaired people to understand mathematical equations","author":"Awais","year":"2024","journal-title":"IEEE Access"},{"key":"10.1016\/j.patcog.2026.113955_b46","series-title":"Proceedings of the Annual Meeting of the Association for Computational Linguistics","first-page":"8199","article-title":"M3CoT: A novel benchmark for multi-domain multi-step multi-modal chain-of-thought","author":"Chen","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b47","doi-asserted-by":"crossref","unstructured":"A. Masry, X.L. Do, J.Q. Tan, S. Joty, E. Hoque, ChartQA: A benchmark for question answering about charts with visual and logical reasoning, in: Findings of the Association for Computational Linguistics, 2022, pp. 2263\u20132279.","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"10.1016\/j.patcog.2026.113955_b48","first-page":"27056","article-title":"Are we on the right way for evaluating large vision-language models?","volume":"vol. 37","author":"Chen","year":"2024"},{"key":"10.1016\/j.patcog.2026.113955_b49","doi-asserted-by":"crossref","unstructured":"X. Yue, Y. Ni, K. Zhang, T. Zheng, R. Liu, G. Zhang, S. Stevens, D. Jiang, W. Ren, Y. Sun, et al., MMMU: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 9556\u20139567.","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"10.1016\/j.patcog.2026.113955_b50","doi-asserted-by":"crossref","unstructured":"X. Yue, T. Zheng, Y. Ni, Y. Wang, K. Zhang, S. Tong, Y. Sun, B. Yu, G. Zhang, H. Sun, et al., MMMU-Pro: A more robust multi-discipline multimodal understanding benchmark, in: Proceedings of the Annual Meeting of the Association for Computational Linguistics, 2025, pp. 15134\u201315186.","DOI":"10.18653\/v1\/2025.acl-long.736"},{"key":"10.1016\/j.patcog.2026.113955_b51","doi-asserted-by":"crossref","unstructured":"M. Mathew, D. Karatzas, C. Jawahar, DocVQA: A dataset for vqa on document images, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2021, pp. 2200\u20132209.","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"10.1016\/j.patcog.2026.113955_b52","doi-asserted-by":"crossref","unstructured":"M. Mathew, V. Bagal, R. Tito, D. Karatzas, E. Valveny, C. Jawahar, InfographicVQA, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2022, pp. 1697\u20131706.","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"10.1016\/j.patcog.2026.113955_b53","doi-asserted-by":"crossref","unstructured":"W. Shi, Z. Hu, Y. Bin, J. Liu, Y. Yang, S.K. Ng, L. Bing, R.K.-W. Lee, Math-LLaVA: Bootstrapping mathematical reasoning for multimodal large language models, in: Findings of the Association for Computational Linguistics, EMNLP 2024, 2024, pp. 4663\u20134680.","DOI":"10.18653\/v1\/2024.findings-emnlp.268"},{"key":"10.1016\/j.patcog.2026.113955_b54","series-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009209?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009209?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:05:12Z","timestamp":1780931112000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326009209"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":54,"alternative-id":["S0031320326009209"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113955","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Reinforcement learning-powered co-optimization: Bridging critic model and multimodal LLM reasoning abilities","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113955","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113955"}}