{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:04:58Z","timestamp":1780931098208,"version":"3.54.1"},"reference-count":45,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.114108","type":"journal-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T15:51:12Z","timestamp":1780069872000},"page":"114108","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Physical plausibility reasoning via HCM-GRPO: Empowering compact model for superior performance"],"prefix":"10.1016","volume":"180","author":[{"given":"Zhiyuan","family":"Hu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zheng","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yi","family":"Wei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Long","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.114108_b1","doi-asserted-by":"crossref","unstructured":"Z. Chen, J. Wu, W. Wang, W. Su, G. Chen, S. Xing, M. Zhong, Q. Zhang, X. Zhu, L. Lu, et al., Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 24185\u201324198.","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"10.1016\/j.patcog.2026.114108_b2","series-title":"Qwen2.5-VL","author":"Team","year":"2025"},{"key":"10.1016\/j.patcog.2026.114108_b3","doi-asserted-by":"crossref","unstructured":"Z. Yang, J. Tang, Z. Li, P. Wang, J. Wan, H. Zhong, X. Liu, M. Yang, P. Wang, S. Bai, et al., Cc-ocr: A comprehensive and challenging ocr benchmark for evaluating large multimodal models in literacy, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 21744\u201321754.","DOI":"10.1109\/ICCV51701.2025.02019"},{"key":"10.1016\/j.patcog.2026.114108_b4","first-page":"23678","article-title":"Comt: A novel benchmark for chain of multi-modal thought on large vision-language models","volume":"vol. 39","author":"Cheng","year":"2025"},{"key":"10.1016\/j.patcog.2026.114108_b5","doi-asserted-by":"crossref","unstructured":"Z. Liu, Z. Sun, Y. Zang, X. Dong, Y. Cao, H. Duan, D. Lin, J. Wang, Visual-rft: Visual reinforcement fine-tuning, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 2034\u20132044.","DOI":"10.1109\/ICCV51701.2025.00197"},{"key":"10.1016\/j.patcog.2026.114108_b6","doi-asserted-by":"crossref","unstructured":"Q. Sun, Y. Cui, X. Zhang, F. Zhang, Q. Yu, Y. Wang, Y. Rao, J. Liu, T. Huang, X. Wang, Generative multimodal models are in-context learners, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14398\u201314409.","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"10.1016\/j.patcog.2026.114108_b7","first-page":"84839","article-title":"Visual autoregressive modeling: Scalable image generation via next-scale prediction","volume":"37","author":"Tian","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114108_b8","doi-asserted-by":"crossref","unstructured":"Z. Huang, S. Zhuang, C. Fu, B. Yang, Y. Zhang, C. Sun, Z. Zhang, Y. Wang, C. Li, Z.-J. Zha, Wegen: A unified model for interactive multimodal generation as we chat, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 23679\u201323689.","DOI":"10.1109\/CVPR52734.2025.02205"},{"key":"10.1016\/j.patcog.2026.114108_b9","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114108_b10","doi-asserted-by":"crossref","unstructured":"R. Rombach, A. Blattmann, D. Lorenz, P. Esser, B. Ommer, High-resolution image synthesis with latent diffusion models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10.1016\/j.patcog.2026.114108_b11","doi-asserted-by":"crossref","unstructured":"W. Peebles, S. Xie, Scalable diffusion models with transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4195\u20134205.","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"10.1016\/j.patcog.2026.114108_b12","series-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models","author":"Ye","year":"2023"},{"key":"10.1016\/j.patcog.2026.114108_b13","first-page":"1","article-title":"Multimodal learning with next-token prediction for large multimodal models","author":"Wang","year":"2026","journal-title":"Nature"},{"key":"10.1016\/j.patcog.2026.114108_b14","doi-asserted-by":"crossref","unstructured":"L. Zhang, A. Rao, M. Agrawala, Adding conditional control to text-to-image diffusion models, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 3836\u20133847.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"10.1016\/j.patcog.2026.114108_b15","doi-asserted-by":"crossref","unstructured":"Z. Tan, S. Liu, X. Yang, Q. Xue, X. Wang, Ominicontrol: Minimal and universal control for diffusion transformer, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 14940\u201314950.","DOI":"10.1109\/ICCV51701.2025.01386"},{"issue":"9","key":"10.1016\/j.patcog.2026.114108_b16","doi-asserted-by":"crossref","first-page":"4798","DOI":"10.1109\/TCSVT.2023.3249185","article-title":"Theme-aware visual attribute reasoning for image aesthetics assessment","volume":"33","author":"Li","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.114108_b17","doi-asserted-by":"crossref","unstructured":"Y. Bin, W. Shi, Y. Ding, Z. Hu, Z. Wang, Y. Yang, S.-K. Ng, H.T. Shen, Gallerygpt: Analyzing paintings with large multimodal models, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 7734\u20137743.","DOI":"10.1145\/3664647.3681656"},{"key":"10.1016\/j.patcog.2026.114108_b18","first-page":"1","article-title":"Bilateral reference for high-resolution dichotomous image segmentation","volume":"3","author":"Zheng","year":"2024","journal-title":"CAAI Artif. Intell. Res."},{"key":"10.1016\/j.patcog.2026.114108_b19","series-title":"FLUX","author":"Labs","year":"2024"},{"issue":"8081","key":"10.1016\/j.patcog.2026.114108_b20","doi-asserted-by":"crossref","first-page":"633","DOI":"10.1038\/s41586-025-09422-z","article-title":"DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning","volume":"645","author":"Guo","year":"2025","journal-title":"Nature"},{"key":"10.1016\/j.patcog.2026.114108_b21","series-title":"Glm-4.5 v and glm-4.1 v-thinking: Towards versatile multimodal reasoning with scalable reinforcement learning","author":"Hong","year":"2025"},{"key":"10.1016\/j.patcog.2026.114108_b22","first-page":"17044","article-title":"Naturalbench: Evaluating vision-language models on natural adversarial samples","volume":"37","author":"Li","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114108_b23","article-title":"Unlocking human intent perception through multimodal large models","author":"Wang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b24","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111641","article-title":"Cross-scene visual context parsing with large vision-language model","volume":"166","author":"Zhang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b25","article-title":"SWG-fusion: Soft weather-guided multimodal fusion with VLM-assistance for BEV object detection under harsh weather","author":"Wang","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b26","article-title":"Event-based facial expression recognition via large vision-language models","author":"Li","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b27","first-page":"113222","article-title":"Dapo: An open-source llm reinforcement learning system at scale","volume":"38","author":"Yu","year":"2026","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114108_b28","article-title":"Value decomposition with maximum correntropy for multi-agent deep reinforcement learning","author":"Liu","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b29","unstructured":"Y. Wang, Z. Li, Y. Zang, C. Wang, Q. Lu, C. Jin, J. Wang, Unified Multimodal Chain-of-Thought Reward Model through Reinforcement Fine-Tuning, in: The Thirty-Ninth Annual Conference on Neural Information Processing Systems."},{"key":"10.1016\/j.patcog.2026.114108_b30","doi-asserted-by":"crossref","unstructured":"J. Zhang, J. Huang, H. Yao, S. Liu, X. Zhang, S. Lu, D. Tao, R1-VL: Learning to Reason with Multimodal Large Language Models via Step-wise Group Relative Policy Optimization, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2025, pp. 1859\u20131869.","DOI":"10.1109\/ICCV51701.2025.00181"},{"key":"10.1016\/j.patcog.2026.114108_b31","unstructured":"Alibaba Cloud, Qwen-VL-max, https:\/\/www.aliyun.com."},{"key":"10.1016\/j.patcog.2026.114108_b32","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111097","article-title":"Eye-movement-prompted large image captioning model","volume":"159","author":"Yang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b33","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2026.113316","article-title":"Towards fine-grained vision-language alignment for few-shot anomaly detection","author":"Fan","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b34","series-title":"InternVL3.5: Advancing open-source multimodal models in versatility, reasoning, and efficiency","author":"Wang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114108_b35","series-title":"Qwen3.5: Accelerating productivity with native multimodal agents","author":"Team","year":"2026"},{"key":"10.1016\/j.patcog.2026.114108_b36","unstructured":"OpenAI, GPT - OpenAI, https:\/\/openai.com."},{"key":"10.1016\/j.patcog.2026.114108_b37","unstructured":"Google, Gemini3 - Google, https:\/\/gemini.google.com."},{"key":"10.1016\/j.patcog.2026.114108_b38","unstructured":"Anthropic, Claude Sonnet4.5 - Anthropic, URL https:\/\/www.anthropic.com\/news\/claude-sonnet-4-5."},{"key":"10.1016\/j.patcog.2026.114108_b39","first-page":"29733","article-title":"Swift: a scalable lightweight infrastructure for fine-tuning","volume":"vol. 39","author":"Zhao","year":"2025"},{"key":"10.1016\/j.patcog.2026.114108_b40","unstructured":"Obsismc, MMLU-multi-answers, URL https:\/\/huggingface.co\/datasets\/Obsismc\/mmlu-multi_answers."},{"key":"10.1016\/j.patcog.2026.114108_b41","doi-asserted-by":"crossref","unstructured":"W. Zhong, R. Cui, Y. Guo, Y. Liang, S. Lu, Y. Wang, A. Saied, W. Chen, N. Duan, Agieval: A human-centric benchmark for evaluating foundation models, in: Findings of the Association for Computational Linguistics: NAACL 2024, 2024, pp. 2299\u20132314.","DOI":"10.18653\/v1\/2024.findings-naacl.149"},{"key":"10.1016\/j.patcog.2026.114108_b42","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111332","article-title":"Enhancing textual textbook question answering with large language models and retrieval augmented generation","volume":"162","author":"Alawwad","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114108_b43","unstructured":"X.AI Corp, Grok-1.5 vision preview: Connecting the digital and physical worlds with our first multimodal model, https:\/\/x.ai\/blog\/grok-1.5v."},{"key":"10.1016\/j.patcog.2026.114108_b44","series-title":"The Thirteenth International Conference on Learning Representations","article-title":"MuirBench: A comprehensive benchmark for robust multi-image understanding","author":"Wang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114108_b45","series-title":"European Conference on Computer Vision","first-page":"148","article-title":"Blink: Multimodal large language models can see but not perceive","author":"Fu","year":"2024"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326010733?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326010733?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T14:43:30Z","timestamp":1780929810000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326010733"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":45,"alternative-id":["S0031320326010733"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114108","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Physical plausibility reasoning via HCM-GRPO: Empowering compact model for superior performance","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114108","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114108"}}