{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:10:37Z","timestamp":1765343437368,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758277","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"13213-13220","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["GEMeX-RMCoT: An Enhanced Med-VQA Dataset for Region-Aware Multimodal Chain-of-Thought Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2165-245X","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong S.A.R., China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4468-5529","authenticated-orcid":false,"given":"Xiangyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong S.A.R., China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1356-8757","authenticated-orcid":false,"given":"Along","family":"He","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6293-4624","authenticated-orcid":false,"given":"Yidi","family":"Chen","sequence":"additional","affiliation":[{"name":"West China Hospital of Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9702-5524","authenticated-orcid":false,"given":"Huazhu","family":"Fu","sequence":"additional","affiliation":[{"name":"IHPC, Agency for Science, Technology and Research, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3130-0554","authenticated-orcid":false,"given":"Xiao-Ming","family":"Wu","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong S.A.R., China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Quantifying uncertainty in answers from any language model and enhancing their trustworthiness. arXiv preprint arXiv:2308.16175","author":"Chen Jiuhai","year":"2023","unstructured":"Jiuhai Chen and Jonas Mueller. 2023. Quantifying uncertainty in answers from any language model and enhancing their trustworthiness. arXiv preprint arXiv:2308.16175 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"EyeGPT: Ophthalmic Assistant with Large Language Models. arXiv preprint arXiv:2403.00840","author":"Chen Xiaolan","year":"2024","unstructured":"Xiaolan Chen, Ziwei Zhao, Weiyi Zhang, Pusheng Xu, Le Gao, Mingpu Xu, Yue Wu, Yinwen Li, Danli Shi, and Mingguang He. 2024. EyeGPT: Ophthalmic Assistant with Large Language Models. arXiv preprint arXiv:2403.00840 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Sft memorizes, rl generalizes: A comparative study of foundation model post-training. arXiv preprint arXiv:2501.17161","author":"Chu Tianzhe","year":"2025","unstructured":"Tianzhe Chu, Yuexiang Zhai, Jihan Yang, Shengbang Tong, Saining Xie, Dale Schuurmans, Quoc V Le, Sergey Levine, and Yi Ma. 2025. Sft memorizes, rl generalizes: A comparative study of foundation model post-training. arXiv preprint arXiv:2501.17161 (2025)."},{"key":"e_1_3_2_1_5_1","volume-title":"HiLa: Hierarchical Vision-Language Collaboration for Cancer Survival Prediction. arXiv preprint arXiv:2507.04613","author":"Cui Jiaqi","year":"2025","unstructured":"Jiaqi Cui, Lu Wen, Yuchen Fei, Bo Liu, Luping Zhou, Dinggang Shen, and Yan Wang. 2025. HiLa: Hierarchical Vision-Language Collaboration for Cancer Survival Prediction. arXiv preprint arXiv:2507.04613 (2025)."},{"key":"e_1_3_2_1_6_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_7_1","volume-title":"Pathvqa: 30000 questions for medical visual question answering. arXiv preprint arXiv:2003.10286","author":"He Xuehai","year":"2020","unstructured":"Xuehai He, Yichen Zhang, Luntian Mou, Eric Xing, and Pengtao Xie. 2020. Pathvqa: 30000 questions for medical visual question answering. arXiv preprint arXiv:2003.10286 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599819"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02093"},{"key":"e_1_3_2_1_10_1","volume-title":"Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint arXiv:2503.06749","author":"Huang Wenxuan","year":"2025","unstructured":"Wenxuan Huang, Bohan Jia, Zijie Zhai, Shaosheng Cao, Zheyu Ye, Fei Zhao, Zhe Xu, Yao Hu, and Shaohui Lin. 2025. Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint arXiv:2503.06749 (2025)."},{"key":"e_1_3_2_1_11_1","unstructured":"Aaron Jaech Adam Kalai Adam Lerer Adam Richardson Ahmed El-Kishky Aiden Low Alec Helyar Aleksander Madry Alex Beutel Alex Carney et al. 2024. Openai o1 system card. arXiv preprint arXiv:2412.16720 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Du Nguyen Duong, Tan Bui, Pierre Chambon, Yuhao Zhang, Matthew P Lungren, Andrew Y Ng, et al.","author":"Jain Saahil","year":"2021","unstructured":"Saahil Jain, Ashwin Agrawal, Adriel Saporta, Steven QH Truong, Du Nguyen Duong, Tan Bui, Pierre Chambon, Yuhao Zhang, Matthew P Lungren, Andrew Y Ng, et al., 2021. Radgraph: Extracting clinical entities and relations from radiology reports. arXiv preprint arXiv:2106.14463 (2021)."},{"key":"e_1_3_2_1_13_1","volume-title":"Preference Optimization for Reasoning with Pseudo Feedback. arXiv preprint arXiv:2411.16345","author":"Jiao Fangkai","year":"2024","unstructured":"Fangkai Jiao, Geyang Guo, Xingxing Zhang, Nancy F Chen, Shafiq Joty, and Furu Wei. 2024. Preference Optimization for Reasoning with Pseudo Feedback. arXiv preprint arXiv:2411.16345 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434063"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_16_1","volume-title":"Med-r1: Reinforcement learning for generalizable medical reasoning in vision-language models. arXiv preprint arXiv:2503.13939","author":"Lai Yuxiang","year":"2025","unstructured":"Yuxiang Lai, Jike Zhong, Ming Li, Shitian Zhao, and Xiaofeng Yang. 2025. Med-r1: Reinforcement learning for generalizable medical reasoning in vision-language models. arXiv preprint arXiv:2503.13939 (2025)."},{"key":"e_1_3_2_1_17_1","volume-title":"Asma Ben Abacha, and Dina Demner-Fushman","author":"Lau Jason J","year":"2018","unstructured":"Jason J Lau, Soumya Gayen, Asma Ben Abacha, and Dina Demner-Fushman. 2018. A dataset of clinically generated visual questions and answers about radiology images. Scientific data, Vol. 5, 1 (2018), 1-10."},{"key":"e_1_3_2_1_18_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Group Relative Policy Optimization for Image Captioning. arXiv preprint arXiv:2503.01333","author":"Liang Xu","year":"2025","unstructured":"Xu Liang. 2025. Group Relative Policy Optimization for Image Captioning. arXiv preprint arXiv:2503.01333 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87196-3_20"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"e_1_3_2_1_22_1","volume-title":"A Large-Scale, Groundable, and Explainable Medical VQA Benchmark for Chest X-ray Diagnosis. arXiv preprint arXiv:2411.16778","author":"Liu Bo","year":"2024","unstructured":"Bo Liu, Ke Zou, Liming Zhan, Zexin Lu, Xiaoyu Dong, Yidi Chen, Chengqiang Xie, Jiannong Cao, Xiao-Ming Wu, and Huazhu Fu. 2024. GEMeX: A Large-Scale, Groundable, and Explainable Medical VQA Benchmark for Chest X-ray Diagnosis. arXiv preprint arXiv:2411.16778 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Visual-rft: Visual reinforcement fine-tuning. arXiv preprint arXiv:2503.01785","author":"Liu Ziyu","year":"2025","unstructured":"Ziyu Liu, Zeyi Sun, Yuhang Zang, Xiaoyi Dong, Yuhang Cao, Haodong Duan, Dahua Lin, and Jiaqi Wang. 2025. Visual-rft: Visual reinforcement fine-tuning. arXiv preprint arXiv:2503.01785 (2025)."},{"key":"e_1_3_2_1_24_1","volume-title":"Reft: Reasoning with reinforced fine-tuning. arXiv preprint arXiv:2401.08967","author":"Luong Trung Quoc","year":"2024","unstructured":"Trung Quoc Luong, Xinbo Zhang, Zhanming Jie, Peng Sun, Xiaoran Jin, and Hang Li. 2024. Reft: Reasoning with reinforced fine-tuning. arXiv preprint arXiv:2401.08967, Vol. 3 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Chen Chen, Cheng Ouyang, and Daniel Rueckert.","author":"Pan Jiazhen","year":"2025","unstructured":"Jiazhen Pan, Che Liu, Junde Wu, Fenglin Liu, Jiayuan Zhu, Hongwei Bran Li, Chen Chen, Cheng Ouyang, and Daniel Rueckert. 2025. Medvlm-r1: Incentivizing medical reasoning capability of vision-language models (vlms) via reinforcement learning. arXiv preprint arXiv:2502.19634 (2025)."},{"key":"e_1_3_2_1_26_1","first-page":"68772","article-title":"Llm evaluators recognize and favor their own generations","volume":"37","author":"Panickssery Arjun","year":"2024","unstructured":"Arjun Panickssery, Samuel Bowman, and Shi Feng. 2024. Llm evaluators recognize and favor their own generations. Advances in Neural Information Processing Systems, Vol. 37 (2024), 68772-68802.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","unstructured":"Qwen : An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei Huan Lin Jian Yang Jianhong Tu Jianwei Zhang Jianxin Yang Jiaxi Yang Jingren Zhou Junyang Lin Kai Dang Keming Lu Keqin Bao Kexin Yang Le Yu Mei Li Mingfeng Xue Pei Zhang Qin Zhu Rui Men Runji Lin Tianhao Li Tianyi Tang Tingyu Xia Xingzhang Ren Xuancheng Ren Yang Fan Yang Su Yichang Zhang Yu Wan Yuqiong Liu Zeyu Cui Zhenru Zhang and Zihan Qiu. 2024. Qwen2.5 Technical Report. arXiv:2412.15115 [cs.CL]"},{"key":"e_1_3_2_1_28_1","volume-title":"Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347","author":"Schulman John","year":"2017","unstructured":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. 2017. Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)."},{"key":"e_1_3_2_1_29_1","unstructured":"Andrew Sellergren Sahar Kazemzadeh Tiam Jaroensri Atilla Kiraly Madeleine Traverse Timo Kohlberger Shawn Xu Fayaz Jamil C\u00edan Hughes Charles Lau Justin Chen Fereshteh Mahvar Liron Yatziv Tiffany Chen Bram Sterling Stefanie Anna Baby Susanna Maria Baby Jeremy Lai Samuel Schmidgall Lu Yang Kejia Chen Per Bjornsson Shashir Reddy Ryan Brush Kenneth Philbrick Mercy Asiedu Ines Mezerreg Howard Hu Howard Yang Richa Tiwari Sunny Jansen Preeti Singh Yun Liu Shekoofeh Azizi Aishwarya Kamath Johan Ferret Shreya Pathak Nino Vieillard Ramona Merhej Sarah Perrin Tatiana Matejovicova Alexandre Ram\u00e9 Morgane Riviere Louis Rouillard Thomas Mesnard Geoffrey Cideron Jean bastien Grill Sabela Ramos Edouard Yvinec Michelle Casbon Elena Buchatskaya Jean-Baptiste Alayrac Dmitry Lepikhin Vlad Feinberg Sebastian Borgeaud Alek Andreev Cassidy Hardin Robert Dadashi L\u00e9onard Hussenot Armand Joulin Olivier Bachem Yossi Matias Katherine Chou Avinatan Hassidim Kavi Goel Clement Farabet Joelle Barral Tris Warkentin Jonathon Shlens David Fleet Victor Cotruta Omar Sanseviero Gus Martins Phoebe Kirk Anand Rao Shravya Shetty David F. Steiner Can Kirmizibayrak Rory Pilgrim Daniel Golden and Lin Yang. 2025. MedGemma Technical Report. arXiv:2507.05201 [cs.AI] https:\/\/arxiv.org\/abs\/2507.05201"},{"key":"e_1_3_2_1_30_1","volume-title":"Deepseekmath: Pushing the limits of mathematical reasoning in open language models. arXiv preprint arXiv:2402.03300","author":"Shao Zhihong","year":"2024","unstructured":"Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Y Wu, et al., 2024. Deepseekmath: Pushing the limits of mathematical reasoning in open language models. arXiv preprint arXiv:2402.03300 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Openthinkimg: Learning to think with images via visual tool reinforcement learning. arXiv preprint arXiv:2505.08617","author":"Su Zhaochen","year":"2025","unstructured":"Zhaochen Su, Linjie Li, Mingyang Song, Yunzhuo Hao, Zhengyuan Yang, Jun Zhang, Guanjie Chen, Jiawei Gu, Juntao Li, Xiaoye Qu, et al., 2025a. Openthinkimg: Learning to think with images via visual tool reinforcement learning. arXiv preprint arXiv:2505.08617 (2025)."},{"key":"e_1_3_2_1_32_1","volume-title":"Fung","author":"Su Zhaochen","year":"2025","unstructured":"Zhaochen Su, Peng Xia, Hangyu Guo, Zhenhua Liu, Yan Ma, Xiaoye Qu, Jiaqi Liu, Yanshu Li, Kaide Zeng, Zhengyuan Yang, Linjie Li, Yu Cheng, Heng Ji, Junxian He, and Yi R. Fung. 2025b. Thinking with Images for Multimodal Reasoning: Foundations, Methods, and Future Frontiers. arXiv:2506.23918 [cs.CV]"},{"key":"e_1_3_2_1_33_1","unstructured":"Zhiqing Sun Sheng Shen Shengcao Cao Haotian Liu Chunyuan Li Yikang Shen Chuang Gan Liang-Yan Gui Yu-Xiong Wang Yiming Yang et al. 2023. Aligning large multimodal models with factually augmented rlhf. arXiv preprint arXiv:2309.14525 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv preprint arXiv:2503.12605","author":"Wang Yaoting","year":"2025","unstructured":"Yaoting Wang, Shengqiong Wu, Yuecheng Zhang, Shuicheng Yan, Ziwei Liu, Jiebo Luo, and Hao Fei. 2025. Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv preprint arXiv:2503.12605 (2025)."},{"key":"e_1_3_2_1_35_1","volume-title":"Internlm-math: Open math large language models toward verifiable reasoning. arXiv preprint arXiv:2402.06332","author":"Ying Huaiyuan","year":"2024","unstructured":"Huaiyuan Ying, Shuo Zhang, Linyang Li, Zhejian Zhou, Yunfan Shao, Zhaoye Fei, Yichuan Ma, Jiawei Hong, Kuikun Liu, Ziyi Wang, et al., 2024. Internlm-math: Open math large language models toward verifiable reasoning. arXiv preprint arXiv:2402.06332 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Codedpo: Aligning code models with self generated and verified source code. arXiv preprint arXiv:2410.05605","author":"Zhang Kechi","year":"2024","unstructured":"Kechi Zhang, Ge Li, Yihong Dong, Jingjing Xu, Jun Zhang, Jing Su, Yongfei Liu, and Zhi Jin. 2024. Codedpo: Aligning code models with self generated and verified source code. arXiv preprint arXiv:2410.05605 (2024)."},{"key":"e_1_3_2_1_37_1","unstructured":"Sheng Zhang Yanbo Xu Naoto Usuyama Hanwen Xu Jaspreet Bagga Robert Tinn Sam Preston Rajesh Rao Mu Wei Naveen Valluri et al. 2023a. BiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs. arXiv preprint arXiv:2303.00915 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023b. Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"MSEarth: A Benchmark for Multimodal Scientific Comprehension of Earth Science. arXiv preprint arXiv:2505.20740","author":"Zhao Xiangyu","year":"2025","unstructured":"Xiangyu Zhao, Wanghan Xu, Bo Liu, Yuhao Zhou, Fenghua Ling, Ben Fei, Xiaoyu Yue, Lei Bai, Wenlong Zhang, and Xiao-Ming Wu. 2025. MSEarth: A Benchmark for Multimodal Scientific Comprehension of Earth Science. arXiv preprint arXiv:2505.20740 (2025)."},{"key":"e_1_3_2_1_40_1","volume-title":"Beyond hallucinations: Enhancing lvlms through hallucination-aware direct preference optimization. arXiv preprint arXiv:2311.16839","author":"Zhao Zhiyuan","year":"2023","unstructured":"Zhiyuan Zhao, Bin Wang, Linke Ouyang, Xiaoyi Dong, Jiaqi Wang, and Conghui He. 2023. Beyond hallucinations: Enhancing lvlms through hallucination-aware direct preference optimization. arXiv preprint arXiv:2311.16839 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Yuhao Zhou Yiheng Wang Xuming He Ruoyao Xiao Zhiwei Li Qiantai Feng Zijie Guo Yuejin Yang Hao Wu Wenxuan Huang et al. 2025. Scientists' First Exam: Probing Cognitive Abilities of MLLM via Perception Understanding and Reasoning. arXiv preprint arXiv:2506.10521 (2025)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Ke Zou Yang Bai Bo Liu Yidi Chen Zhihao Chen Yang Zhou Xuedong Yuan Meng Wang Xiaojing Shen Xiaochun Cao et al. 2025. Uncertainty-aware Medical Diagnostic Phrase Identification and Grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025).","DOI":"10.1109\/TPAMI.2025.3596878"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758277","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:08:39Z","timestamp":1765343319000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758277"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3758277","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758277","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}