{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T07:12:18Z","timestamp":1778051538125,"version":"3.51.4"},"reference-count":52,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00045","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"382-392","source":"Crossref","is-referenced-by-count":0,"title":["M4U: Evaluating Multilingual Understanding and Reasoning for Large Multimodal Models"],"prefix":"10.1109","author":[{"given":"Hongyu","family":"Wang","sequence":"first","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiayu","family":"Xu","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Senwei","family":"Xie","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruiping","family":"Wang","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jialin","family":"Li","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaojie","family":"Xie","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bin","family":"Zhang","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chuyan","family":"Xiong","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xilin","family":"Chen","sequence":"additional","affiliation":[{"name":"China University of Chinese Academy of Sciences,Institute of Computing Technology, Chinese Academy of Sciences,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024"},{"key":"ref2","year":"2024","journal-title":"Yi: Open foundation models by 01.ai"},{"key":"ref3","article-title":"Gemini: A family of highly capable multimodal models","volume":"abs\/2312.11805","author":"Anil","year":"2023","journal-title":"CoRR"},{"key":"ref4","article-title":"Qwen technical report","volume":"abs\/2309.16609","author":"Bai","year":"2023","journal-title":"CoRR"},{"key":"ref5","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","volume":"abs\/2308.12966","author":"Bai","year":"2023","journal-title":"CoRR"},{"key":"ref6","article-title":"Qwen2.5-vl technical report","volume":"abs\/2502.13923","author":"Bai","year":"2025","journal-title":"CoRR"},{"key":"ref7","article-title":"Microsoft COCO captions: Data collection and evaluation server","volume":"abs\/1504.00325","author":"Chen","year":"2015","journal-title":"CoRR"},{"key":"ref8","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024"},{"key":"ref9","article-title":"Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities","volume":"abs\/2507.06261","author":"Comanici","year":"2025","journal-title":"CoRR"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.420"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"ref13","article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","volume":"abs\/2306.13394","author":"Fu","year":"2023","journal-title":"CoRR"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref15","article-title":"Measuring massive multitask language understanding","volume-title":"ICLR 2021","author":"Hendrycks","year":"2021"},{"key":"ref16","article-title":"C-eval: A multi-level multi-discipline chinese evaluation suite for foundation models","author":"Huang","year":"2023","journal-title":"NeurIPS 2023"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref18","article-title":"Mistral 7b","volume":"abs\/2310.06825","author":"Jiang","year":"2023","journal-title":"CoRR"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.50"},{"key":"ref21","article-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension","volume":"abs\/2307.16125","author":"Li","year":"2023","journal-title":"CoRR"},{"key":"ref22","article-title":"CMMLU: measuring massive multitask language understanding in chinese","volume":"abs\/2306.09212","author":"Li","year":"2023","journal-title":"CoRR"},{"key":"ref23","article-title":"M3it: A large-scale dataset towards multi-modal multilingual instruction tuning","author":"Li","year":"2023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.616"},{"key":"ref26","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref28","author":"Lu","year":"2024","journal-title":"Deepseek-vl: Towards real-world vision-language understanding"},{"key":"ref29","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","author":"Lu","year":"2022","journal-title":"NeurIPS 2022"},{"key":"ref30","article-title":"Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts","author":"Lu","year":"2024","journal-title":"ICLR 2024"},{"key":"ref31","article-title":"GPT-4 technical report","volume":"abs\/2303.08774","year":"2023","journal-title":"CoRR"},{"key":"ref32","year":"2024","journal-title":"Hello gpt-4o"},{"key":"ref33","year":"2024","journal-title":"Mmmlu dataset"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.185"},{"key":"ref35","author":"Shi","year":"2022","journal-title":"Language models are multilingual chain-of-thought reasoners"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.919"},{"key":"ref37","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","author":"Srivastava","year":"2022"},{"key":"ref38","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024"},{"key":"ref39","author":"Team","year":"2025","journal-title":"Glm-4.5v and glm-4.1v-thinking: Towards versatile multimodal reasoning with scalable reinforcement learning"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.5040\/9781501365072.09396"},{"key":"ref41","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"key":"ref42","article-title":"Cogvlm: Visual expert for pretrained language models","volume":"abs\/2311.03079","author":"Wang","year":"2023","journal-title":"CoRR"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-65093-6_300275"},{"key":"ref44","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2022","journal-title":"NeurIPS 2022"},{"key":"ref45","article-title":"Qwen2. 5 technical report","author":"Yang","year":"2024"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1382"},{"key":"ref47","article-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities","volume-title":"ICML 2024","author":"Yu","year":"2024"},{"key":"ref48","article-title":"MMMU: A massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI","volume":"abs\/2311.16502","author":"Yue","year":"2023","journal-title":"CoRR"},{"key":"ref49","article-title":"CMMMU: A chinese massive multi-discipline multimodal understanding benchmark","volume":"abs\/2401.11944","author":"Zhang","year":"2024","journal-title":"CoRR"},{"key":"ref50","author":"Zhang","year":"2023","journal-title":"Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition"},{"key":"ref51","article-title":"M3exam: A multilingual, multimodal, multilevel benchmark for examining large language models","volume":"abs\/2306.05179","author":"Zhang","year":"2023","journal-title":"CoRR"},{"key":"ref52","article-title":"Multimodal chain-of-thought reasoning in language models","volume":"abs\/2302.00923","author":"Zhang","year":"2023","journal-title":"CoRR"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492617.pdf?arnumber=11492617","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:16:52Z","timestamp":1778048212000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492617\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":52,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00045","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}