{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T14:27:50Z","timestamp":1778855270764,"version":"3.51.4"},"reference-count":128,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"publisher","award":["62406189,62322113,62376156"],"award-info":[{"award-number":["62406189,62322113,62376156"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00291","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"3034-3046","source":"Crossref","is-referenced-by-count":1,"title":["Corvid: Improving Multimodal Large Language Models Towards Chain-of-Thought Reasoning"],"prefix":"10.1109","author":[{"given":"Jingjing","family":"Jiang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Ma","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xurui","family":"Song","sequence":"additional","affiliation":[{"name":"Nanyang Technological University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hanwang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun","family":"Luo","sequence":"additional","affiliation":[{"name":"Nanyang Technological University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018076"},{"key":"ref2","article-title":"Enhancing textbook question answering task with large language models and retrieval augmented generation","author":"Alawwad","year":"2024","journal-title":"arXiv:2402.05128"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref4","article-title":"Anthropic","volume-title":"Grok-1.5 vision preview","year":"2024"},{"key":"ref5","article-title":"Qwen technical report","author":"Bai","year":"2023","journal-title":"arXiv:2309.16609"},{"key":"ref6","article-title":"Qwen2.5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv:2502.13923"},{"key":"ref7","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","volume-title":"ACL Workshop","author":"Banerjee","year":"2005"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref9","article-title":"Mapqa: A dataset for question answering on choropleth maps","author":"Chang","year":"2022","journal-title":"arXiv:2211.08545"},{"key":"ref10","article-title":"Allava: Harnessing gpt4v-synthesized data for a lite vision-language model","author":"Chen","year":"2024","journal-title":"arXiv:2402.11684"},{"key":"ref11","article-title":"Sharegpt4v: Improving large multi-modal models with better captions","author":"Chen","year":"2023","journal-title":"arXiv:2311.12793"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref13","article-title":"Januspro: Unified multimodal understanding and generation with data and model scaling","author":"Chen","year":"2025","journal-title":"arXiv:2501.17811"},{"key":"ref14","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024","journal-title":"arXiv:2412.05271"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"ref16","article-title":"Mllm is a strong reranker: Advancing multimodal retrievalaugmented generation via knowledge-enhanced reranking and noise-injected training","author":"Chen","year":"2024","journal-title":"arXiv:2407.21439"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.63317\/5ceqa2phg88j"},{"key":"ref18","article-title":"Hitab: A hierarchical table dataset for question answering and natural language generation","author":"Cheng","year":"2021","journal-title":"arXiv:2108.06712"},{"key":"ref19","volume-title":"Sharegpt-4o: Comprehensive multimodal annotations with gpt-4o","author":"Cui","year":"2024"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"ref21","article-title":"Deepseek-rl: Incentivizing reasoning capability in 11 ms via reinforcement learning","author":"Guo DeepSeek-AI","year":"2025","journal-title":"arXiv:2501.12948"},{"key":"ref22","article-title":"Molmo and pixmo: Open weights and open data for state-of-the-art multimodal models","author":"Deitke","year":"2024","journal-title":"arXiv:2409.17146"},{"key":"ref23","article-title":"Active prompting with chain-of-thought for large language models","author":"Diao","year":"2023","journal-title":"arXiv:2302.12246"},{"key":"ref24","first-page":"8469","article-title":"Palm-e: An embodied multimodal language model","author":"Driess","year":"2023","journal-title":"ICML"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685520"},{"key":"ref26","article-title":"Vita-1.5: Towards gpt-4o level real-time vision and speech interaction","author":"Fu","year":"2025","journal-title":"arXiv:2501.01957"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73337-6_9"},{"key":"ref28","article-title":"Chain-of-thought hub: A continuous effort to measure large language models\u2019 reasoning performance","author":"Fu","year":"2023","journal-title":"arXiv:2305.17306"},{"key":"ref29","article-title":"Llamaadapter v2: Parameter-efficient visual instruction model","author":"Gao","year":"2023","journal-title":"arXiv:2304.15010"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681249"},{"key":"ref31","article-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools","author":"Team","year":"2024","journal-title":"arXiv:2406.12793"},{"key":"ref32","article-title":"Multimodal-gpt: A vision and language model for dialogue with humans","author":"Gong","year":"2023","journal-title":"arXiv:2305.04790"},{"key":"ref33","article-title":"Glore: When, where, and how to improve 11 m reasoning via global and local refinements","author":"Havrilla","year":"2024","journal-title":"arXiv:2402.10963"},{"key":"ref34","article-title":"Efficient multimodal learning from data-centric perspective","author":"He","year":"2024","journal-title":"arXiv:2402.11530"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref36","article-title":"Large language models can self-improve","author":"Huang","year":"2022","journal-title":"arXiv:2210.11610"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p17-1167"},{"key":"ref38","article-title":"Openai o1 system card","author":"Jaech","year":"2024","journal-title":"arXiv:2412.16720"},{"key":"ref39","article-title":"Mantis: Interleaved multi-image instruction tuning","author":"Jiang","year":"2024","journal-title":"arXiv:2405.01483"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00592"},{"key":"ref42","article-title":"Figureqa: An annotated figure dataset for visual reasoning","author":"Kahou","year":"2017","journal-title":"arXiv:1710.07300"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.277"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref45","first-page":"2611","article-title":"The hateful memes challenge: Detecting hate speech in multimodal memes","author":"Kiela","year":"2020","journal-title":"NeurIPS"},{"key":"ref46","first-page":"22199","article-title":"Large language models are zero-shot reasoners","author":"Kojima","year":"2022","journal-title":"NeurIPS"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref48","article-title":"Building and better understanding vision-language models: insights and future directions","author":"Lauren\u00e7on","year":"2024","journal-title":"arXiv:2408.12637"},{"key":"ref49","article-title":"Seed-bench: Benchmarking multimodal 11 ms with generative comprehension","author":"Li","year":"2023","journal-title":"arXiv:2307.16125"},{"key":"ref50","first-page":"51991","article-title":"Camel: Communicative agents for \u201cmind\u201d exploration of large language model society","author":"Li","year":"2023","journal-title":"NeurIPS"},{"key":"ref51","first-page":"19730","article-title":"Blip2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"ICML"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"ref53","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text summarization branches out"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref55","article-title":"Clevr-math: A dataset for compositional language, visual and mathematical reasoning","author":"Lindstr\u00f6m","year":"2022","journal-title":"arXiv:2208.05358"},{"key":"ref56","article-title":"Visual spatial reasoning","author":"Liu","year":"2022","journal-title":"arXiv:2205.00363"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref59","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge","author":"Liu","year":"2024"},{"key":"ref60","article-title":"Points1.5: Building a vision-language model towards real world applications","author":"Liu","year":"2024","journal-title":"arXiv:2412.08443"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref62","article-title":"The llama 3 herd of models","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref63","article-title":"Deepseek-vl: towards real-world vision-language understanding","author":"Lu","year":"2024","journal-title":"arXiv:2403.05525"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.528"},{"key":"ref65","article-title":"Iconqa: A new benchmark for abstract diagram understanding and visual language reasoning","author":"Lu","year":"2021","journal-title":"arXiv:2110.13214"},{"key":"ref66","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","author":"Lu","year":"2022","journal-title":"NeurIPS"},{"key":"ref67","article-title":"Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts","author":"Lu","year":"2023","journal-title":"arXiv:2310.02255"},{"key":"ref68","first-page":"43447","article-title":"Chameleon: Plug-and-play compositional reasoning with large language models","author":"Lu","year":"2023","journal-title":"NeurIPS"},{"key":"ref69","article-title":"Ovis: Structural embedding alignment for multimodal large language model","author":"Lu","year":"2024","journal-title":"arXiv:2405.20797"},{"key":"ref70","first-page":"29615","article-title":"Cheap and quick: Efficient visionlanguage instruction tuning for large language models","author":"Luo","year":"2023","journal-title":"NeurIPS"},{"key":"ref71","article-title":"Ursa: Understanding and verifying chain-of-thought reasoning in multimodal mathematics","author":"Luo","year":"2025","journal-title":"arXiv:2501.04686"},{"key":"ref72","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1007\/s100320200071","article-title":"The iam-database: an english sentence database for offline handwriting recognition","volume":"5","year":"2002","journal-title":"IJDAR"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"ref76","article-title":"Mm1: Methods, analysis & insights from multimodal 11 m pre-training","author":"McKinzie","year":"2024","journal-title":"arXiv:2403.09611"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"ref78","volume-title":"Free dolly: Introducing the world\u2019s first truly open instruction-tuned llm","author":"Mike","year":"2023"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref80","article-title":"Orca-math: Unlocking the potential of slms in grade school math","author":"Mitra","year":"2024","journal-title":"arXiv:2402.14830"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01367"},{"key":"ref82","first-page":"25081","article-title":"Embodiedgpt: Vision-language pre-training via embodied chain of thought","author":"Mu","year":"2023","journal-title":"NeurIPS"},{"key":"ref83","first-page":"3982","article-title":"sentence embeddings using siamese bertnetworks","author":"Nils","year":"2020","journal-title":"EMNLP"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73027-6_17"},{"key":"ref85","article-title":"OpenAI","volume-title":"Hello gpt-4o","year":"2024"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1142"},{"key":"ref88","article-title":"We-math: Does your large multimodal model achieve human-like mathematical reasoning?","author":"Qiao","year":"2024","journal-title":"arXiv:2407.01284"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"ref90","article-title":"Eagle: Exploring the design space for multimodal 11 ms with mixture of encoders","author":"Shi","year":"2024","journal-title":"arXiv:2408.15998"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref93","article-title":"Scaling llm test-time compute optimally can be more effective than scaling model parameters","author":"Snell","year":"2024","journal-title":"arXiv:2408.03314"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17635"},{"key":"ref95","article-title":"Gemma 2: Improving open language models at a practical size","author":"Team","year":"2024","journal-title":"arXiv:2408.00118"},{"key":"ref96","article-title":"Teknium","year":"2023","journal-title":"Openhermes 2.5: An open dataset of synthetic data for generalist 11 m assistants"},{"key":"ref97","article-title":"Llamav-o1: Rethinking step-by-step visual reasoning in 11 ms","author":"Thawakar","year":"2025","journal-title":"arXiv:2501.06186"},{"key":"ref98","article-title":"Cambrian1: A fully open, vision-centric exploration of multimodal llms","author":"Tong","year":"2024","journal-title":"arXiv:2406.16860"},{"key":"ref99","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref100","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","author":"Tsimpoukelli","year":"2021","journal-title":"NeurIPS"},{"key":"ref101","article-title":"Measuring multimodal mathematical reasoning with math-vision dataset","author":"Wang","year":"2024","journal-title":"arXiv:2402.14804"},{"key":"ref102","article-title":"Enhancing the reasoning ability of multimodal large language models via mixed preference optimization","author":"Wang","year":"2024","journal-title":"arXiv:2411.10442"},{"key":"ref103","article-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2022","journal-title":"arXiv:2203.11171"},{"key":"ref104","article-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2023","journal-title":"ICLR"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_23"},{"key":"ref106","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2022","journal-title":"NeurIPS"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.167"},{"key":"ref108","volume-title":"X.AI. Grok-1.5 vision preview","year":"2024"},{"key":"ref109","article-title":"Llava-o1: Let vision language models reason step-by-step","author":"Xu","year":"2024","journal-title":"arXiv:2411.10440"},{"key":"ref110","article-title":"Mulberry: Empowering mllm with o1-like reasoning and reflection via collective monte carlo tree search","author":"Yao","year":"2024","journal-title":"arXiv:2412.18319"},{"key":"ref111","article-title":"Minicpm-v: A gpt-4v level mllm on your phone","author":"Yao","year":"2024","journal-title":"arXiv:2408.01800"},{"key":"ref112","article-title":"mplug-owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023","journal-title":"arXiv:2304.14178"},{"key":"ref113","article-title":"Mmt-bench: A comprehensive multimodal benchmark for evaluating large vision-language models towards multitask agi","author":"Ying","year":"2024","journal-title":"arXiv:2404.16006"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00546"},{"key":"ref118","article-title":"Mm1.5: Methods, analysis & insights from multimodal llm fine-tuning","author":"Zhang","year":"2024","journal-title":"arXiv:2409.20566"},{"key":"ref119","article-title":"Mathverse: Does your multimodal 11 m truly see the diagrams in visual math problems?","author":"Zhang","year":"2024","journal-title":"arXiv:2403.14624"},{"key":"ref120","article-title":"Mavis: Mathematical visual instruction tuning","author":"Zhang","year":"2024","journal-title":"arXiv:2407.08739"},{"key":"ref121","article-title":"Improve vision language model chain-ofthought reasoning","author":"Zhang","year":"2024","journal-title":"arXiv:2410.16198"},{"key":"ref122","article-title":"Beyond llavahd: Diving into high-resolution large multimodal models","author":"Zhang","year":"2024","journal-title":"arXiv:2406.08487"},{"key":"ref123","article-title":"Automatic chain of thought prompting in large language models","author":"Zhang","year":"2023","journal-title":"ICLR"},{"key":"ref124","article-title":"Mllm-dataengine: An iterative refinement approach for mllm","author":"Zhao","year":"2023","journal-title":"arXiv:2308.13566"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.131"},{"key":"ref126","article-title":"Seq2sql: Generating structured queries from natural language using reinforcement learning","author":"Zhong","year":"2017","journal-title":"arxiv 2017"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.254"},{"key":"ref128","article-title":"Dynamath: A dynamic visual benchmark for evaluating mathematical reasoning robustness of vision language models","author":"Zou","year":"2024","journal-title":"arXiv:2411.00836"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445582.pdf?arnumber=11445582","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:17:13Z","timestamp":1777612633000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445582\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":128,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00291","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}