{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:16:10Z","timestamp":1765340170096,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754585","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"5385-5393","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OCR-Critic: Aligning Multimodal Large Language Models' Perception through Critical Feedback"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1496-1304","authenticated-orcid":false,"given":"Qiuna","family":"Tan","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4508-201X","authenticated-orcid":false,"given":"Runqi","family":"Qiao","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2318-0281","authenticated-orcid":false,"given":"Guanting","family":"Dong","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1869-3734","authenticated-orcid":false,"given":"YiFan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3529-7689","authenticated-orcid":false,"given":"Minhui","family":"Wu","sequence":"additional","affiliation":[{"name":"WeChat Vision, Tencent, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2060-3488","authenticated-orcid":false,"given":"Jiapeng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9185-3353","authenticated-orcid":false,"given":"Miaoxuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2058-8531","authenticated-orcid":false,"given":"Yida","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5365-6250","authenticated-orcid":false,"given":"Chong","family":"Sun","sequence":"additional","affiliation":[{"name":"WeChat Vision, Tencent, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2450-8525","authenticated-orcid":false,"given":"Chen","family":"Li","sequence":"additional","affiliation":[{"name":"WeChat Vision, Tencent, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8287-6783","authenticated-orcid":false,"given":"Honggang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al., 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_2_2_1","volume-title":"Mllm-as-a-judge: Assessing multimodal llm-as-a-judge with vision-language benchmark. arXiv preprint arXiv:2402.04788","author":"Chen Dongping","year":"2024","unstructured":"Dongping Chen, Ruoxi Chen, Shilin Zhang, Yinuo Liu, Yaochen Wang, Huichi Zhou, Qihui Zhang, Yao Wan, Pan Zhou, and Lichao Sun. 2024a. Mllm-as-a-judge: Assessing multimodal llm-as-a-judge with vision-language benchmark. arXiv preprint arXiv:2402.04788 (2024)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024b. How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_2_4_1","volume-title":"Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022. Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159 (2022)."},{"key":"e_1_3_2_2_5_1","volume-title":"Towards Detecting Prompt Knowledge Gaps for Improved LLM-guided Issue Resolution. arXiv preprint arXiv:2501.11709","author":"Ehsani Ramtin","year":"2025","unstructured":"Ramtin Ehsani, Sakshi Pathak, and Preetha Chatterjee. 2025. Towards Detecting Prompt Knowledge Gaps for Improved LLM-guided Issue Resolution. arXiv preprint arXiv:2501.11709 (2025)."},{"key":"e_1_3_2_2_6_1","volume-title":"Don't Hallucinate","author":"Feng Shangbin","year":"2024","unstructured":"Shangbin Feng, Weijia Shi, Yike Wang, Wenxuan Ding, Vidhisha Balachandran, and Yulia Tsvetkov. 2024. Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM Collaboration. arXiv preprint arXiv:2402.00367 (2024)."},{"key":"e_1_3_2_2_7_1","unstructured":"Ling Fu Biao Yang Zhebin Kuang Jiajun Song Yuzhe Li Linghao Zhu Qidi Luo Xinyu Wang Hao Lu Mingxin Huang et al. 2024. OCRBench v2: An Improved Benchmark for Evaluating Large Multimodal Models on Visual Text Localization and Reasoning. arXiv preprint arXiv:2501.00321 (2024)."},{"key":"e_1_3_2_2_8_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al., 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_2_9_1","unstructured":"Jiawei Gu Xuhui Jiang Zhichao Shi Hexiang Tan Xuehao Zhai Chengjin Xu Wei Li Yinghan Shen Shengjie Ma Honghao Liu et al. 2024. A Survey on LLM-as-a-Judge. arXiv preprint arXiv:2411.15594 (2024)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01478"},{"key":"e_1_3_2_2_11_1","volume-title":"TableVQA-Bench: A Visual Question Answering Benchmark on Multiple Table Domains. arXiv preprint arXiv:2404.19205","author":"Kim Yoonsik","year":"2024","unstructured":"Yoonsik Kim, Moonbin Yim, and Ka Yeon Song. 2024. TableVQA-Bench: A Visual Question Answering Benchmark on Multiple Table Domains. arXiv preprint arXiv:2404.19205 (2024)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00281"},{"key":"e_1_3_2_2_13_1","volume-title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods. arXiv preprint arXiv:2412.05579","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qian Dong, Junjie Chen, Huixue Su, Yujia Zhou, Qingyao Ai, Ziyi Ye, and Yiqun Liu. 2024a. LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods. arXiv preprint arXiv:2412.05579 (2024)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4235-6"},{"key":"e_1_3_2_2_17_1","volume-title":"Textmonkey: An ocr-free large multimodal model for understanding document. arXiv preprint arXiv:2403.04473","author":"Liu Yuliang","year":"2024","unstructured":"Yuliang Liu, Biao Yang, Qiang Liu, Zhang Li, Zhiyin Ma, Shuo Zhang, and Xiang Bai. 2024b. Textmonkey: An ocr-free large multimodal model for understanding document. arXiv preprint arXiv:2403.04473 (2024)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_2"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107980"},{"key":"e_1_3_2_2_20_1","volume-title":"Jia Qing Tan, Shafiq Joty, and Enamul Hoque.","author":"Masry Ahmed","year":"2022","unstructured":"Ahmed Masry, Do Xuan Long, Jia Qing Tan, Shafiq Joty, and Enamul Hoque. 2022. Chartqa: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)."},{"key":"e_1_3_2_2_21_1","volume-title":"Infographicvqa. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 1697-1706","author":"Mathew Minesh","year":"2022","unstructured":"Minesh Mathew, Viraj Bagal, Rub\u00e8n Tito, Dimosthenis Karatzas, Ernest Valveny, and CV Jawahar. 2022. Infographicvqa. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 1697-1706."},{"key":"e_1_3_2_2_22_1","unstructured":"OpenAI. 2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_2_2_23_1","volume-title":"Citekey: gptvision","author":"System Card R","year":"2023","unstructured":"R OpenAI. 2023. GPT-4V (ision) System Card. Citekey: gptvision (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Qiao Runqi","year":"2012","unstructured":"Runqi Qiao, Qiuna Tan, Guanting Dong, MinhuiWu MinhuiWu, Jiapeng Wang, YiFan Zhang, Zhuoma GongQue, Chong Sun, Yida Xu, Yadong Xue, Ye Tian, Zhimin Bao, Lan Yang, Chen Li, and Honggang Zhang. 2025a. V-Oracle: Making Progressive Reasoning in Deciphering Oracle Bones for You and Me. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Wanxiang Che, Joyce Nabende, Ekaterina Shutova, and Mohammad Taher Pilehvar (Eds.). Association for Computational Linguistics, Vienna, Austria, 20124-20150. https:\/\/aclanthology.org\/2025.acl-long.986\/"},{"key":"e_1_3_2_2_25_1","volume-title":"We-math: Does your large multimodal model achieve human-like mathematical reasoning? arXiv preprint arXiv:2407.01284","author":"Qiao Runqi","year":"2024","unstructured":"Runqi Qiao, Qiuna Tan, Guanting Dong, Minhui Wu, Chong Sun, Xiaoshuai Song, Zhuoma GongQue, Shanglin Lei, Zhe Wei, Miaoxuan Zhang, et al., 2024a. We-math: Does your large multimodal model achieve human-like mathematical reasoning? arXiv preprint arXiv:2407.01284 (2024)."},{"key":"e_1_3_2_2_26_1","unstructured":"Runqi Qiao Qiuna Tan Peiqing Yang Yanzi Wang Xiaowan Wang Enhui Wan Sitong Zhou Guanting Dong Yuchen Zeng Yida Xu et al. 2025b. We-Math 2.0: A Versatile MathBook System for Incentivizing Visual Mathematical Reasoning. arXiv preprint arXiv:2508.10433 (2025)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01203"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00130"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Amanpreet Singh Vivek Natarajan Meet Shah Yu Jiang Xinlei Chen Dhruv Batra Devi Parikh and Marcus Rohrbach. 2019. Towards vqa models that can read. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 8317-8326.","DOI":"10.1109\/CVPR.2019.00851"},{"key":"e_1_3_2_2_30_1","volume-title":"A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313","author":"Tonmoy SM","year":"2024","unstructured":"SM Tonmoy, SM Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313 (2024)."},{"key":"e_1_3_2_2_31_1","unstructured":"Jiaqi Wang Hanqi Jiang Yiheng Liu Chong Ma Xu Zhang Yi Pan Mengyuan Liu Peiran Gu Sichen Xia Wenjun Li et al. 2024b. A comprehensive review of multimodal large language models: Performance and challenges across different tasks. arXiv preprint arXiv:2408.01319 (2024)."},{"key":"e_1_3_2_2_32_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_2_33_1","unstructured":"Haoran Wei Chenglong Liu Jinyue Chen Jia Wang Lingyu Kong Yanming Xu Zheng Ge Liang Zhao Jianjian Sun Yuang Peng et al. 2024. General ocr theory: Towards ocr-2.0 via a unified end-to-end model. (2024)."},{"key":"e_1_3_2_2_34_1","volume-title":"Llava-critic: Learning to evaluate multimodal models. arXiv preprint arXiv:2410.02712","author":"Xiong Tianyi","year":"2024","unstructured":"Tianyi Xiong, Xiyao Wang, Dong Guo, Qinghao Ye, Haoqi Fan, Quanquan Gu, Heng Huang, and Chunyuan Li. 2024. Llava-critic: Learning to evaluate multimodal models. arXiv preprint arXiv:2410.02712 (2024)."},{"key":"e_1_3_2_2_35_1","unstructured":"Yuan Yao Tianyu Yu Ao Zhang Chongyi Wang Junbo Cui Hongji Zhu Tianchi Cai Haoyu Li Weilin Zhao Zhihui He et al. 2024. MiniCPM-V: A GPT-4V Level MLLM on Your Phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_2_36_1","unstructured":"Jiabo Ye Anwen Hu Haiyang Xu Qinghao Ye Ming Yan Yuhao Dan Chenlin Zhao Guohai Xu Chenliang Li Junfeng Tian et al. 2023a. mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)."},{"key":"e_1_3_2_2_37_1","volume-title":"Ureader: Universal ocr-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Guohai Xu, Chenliang Li, Junfeng Tian, Qi Qian, Ji Zhang, et al., 2023b. Ureader: Universal ocr-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126 (2023)."},{"key":"e_1_3_2_2_38_1","volume-title":"Critic-v: Vlm critics help catch vlm errors in multimodal reasoning. arXiv preprint arXiv:2411.18203","author":"Zhang Di","year":"2024","unstructured":"Di Zhang, Jingdi Lei, Junxian Li, Xunzhi Wang, Yujie Liu, Zonglin Yang, Jiatong Li, Weida Wang, Suorong Yang, Jianbo Wu, et al., 2024a. Critic-v: Vlm critics help catch vlm errors in multimodal reasoning. arXiv preprint arXiv:2411.18203 (2024)."},{"key":"e_1_3_2_2_39_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024b. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00972"},{"key":"e_1_3_2_2_41_1","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al., 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46595-46623.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_42_1","volume-title":"Judgelm: Fine-tuned large language models are scalable judges. arXiv preprint arXiv:2310.17631","author":"Zhu Lianghui","year":"2023","unstructured":"Lianghui Zhu, Xinggang Wang, and Xinlong Wang. 2023. Judgelm: Fine-tuned large language models are scalable judges. arXiv preprint arXiv:2310.17631 (2023)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00314"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754585","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:14:01Z","timestamp":1765340041000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754585"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":43,"alternative-id":["10.1145\/3746027.3754585","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754585","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}