{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:27:11Z","timestamp":1763922431943,"version":"3.45.0"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032093677","type":"print"},{"value":"9783032093684","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-09368-4_18","type":"book-chapter","created":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:36Z","timestamp":1763921676000},"page":"292-309","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Rule-Based Reinforcement Learning for\u00a0Document Image Classification with\u00a0Vision Language Models"],"prefix":"10.1007","author":[{"given":"Michael","family":"Jungo","sequence":"first","affiliation":[]},{"given":"Andreas","family":"Fischer","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"18_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"18_CR2","doi-asserted-by":"publisher","unstructured":"Aghajanyan, A., Gupta, S., Zettlemoyer, L.: Intrinsic dimensionality explains the effectiveness of language model fine-tuning. 
In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (vol. 1: Long Papers), pp. 7319\u20137328. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.568","DOI":"10.18653\/v1\/2021.acl-long.568"},{"key":"18_CR3","unstructured":"Bai, S., et\u00a0al.: Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"18_CR4","unstructured":"Bai, Y., et\u00a0al.: Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv preprint arXiv:2204.05862 (2022)"},{"key":"18_CR5","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR6","unstructured":"Chu, T., Zhai, Y., Yang, J., Tong, S., Xie, S., Levine, S., Ma, Y.: SFT memorizes, RL generalizes: a comparative study of foundation model post-training. In: The Second Conference on Parsimony and Learning (Recent Spotlight Track) (2025). https:\/\/openreview.net\/forum?id=d3E3LWmTar"},{"key":"18_CR7","unstructured":"Dai, J., et al.: Safe RLHF: safe reinforcement learning from human feedback. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=TyFrPOKYXw"},{"key":"18_CR8","unstructured":"Deitke, M., et\u00a0al.: Molmo and PixMo: open weights and open data for state-of-the-art multimodal models. arXiv preprint arXiv:2409.17146 (2024)"},{"key":"18_CR9","first-page":"10088","volume":"36","author":"T Dettmers","year":"2023","unstructured":"Dettmers, T., Pagnoni, A., Holtzman, A., Zettlemoyer, L.: QLoRA: efficient finetuning of quantized LLMs. Adv. Neural. Inf. Process. Syst. 
36, 10088\u201310115 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR10","unstructured":"Grattafiori, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"18_CR11","unstructured":"Guo, D., et\u00a0al.: DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)"},{"key":"18_CR12","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: Proc. Int. Conf. on Document Analysis and Recognition (ICDAR), pp. 991\u2013995 (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"issue":"2","key":"18_CR13","first-page":"3","volume":"1","author":"EJ Hu","year":"2022","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. ICLR 1(2), 3 (2022)","journal-title":"ICLR"},{"key":"18_CR14","unstructured":"Lai, Y., Zhong, J., Li, M., Zhao, S., Yang, X.: Med-R1: reinforcement learning for generalizable medical reasoning in vision-language models. arXiv preprint arXiv:2503.13939 (2025)"},{"key":"18_CR15","unstructured":"Lambert, N., et\u00a0al.: T\u00fclu 3: pushing frontiers in open language model post-training. arXiv preprint arXiv:2411.15124 (2024)"},{"key":"18_CR16","first-page":"11673","volume":"35","author":"S Larson","year":"2022","unstructured":"Larson, S., Lim, Y.Y.G., Ai, Y., Kuang, D., Leach, K.: Evaluating out-of-distribution performance on document image classifiers. Adv. Neural. Inf. Process. Syst. 35, 11673\u201311685 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR17","unstructured":"Lauren\u00e7on, H., Marafioti, A., Sanh, V., Tronchon, L.: Building and better understanding vision-language models: insights and future directions. 
In: Workshop on Responsibly Building the Next Generation of Multimodal Foundational Models (2024)"},{"key":"18_CR18","doi-asserted-by":"crossref","unstructured":"Lewis, D., Agam, G., Argamon, S., Frieder, O., Grossman, D., Heard, J.: Building a test collection for complex document information processing. In: Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 665\u2013666 (2006)","DOI":"10.1145\/1148170.1148307"},{"key":"18_CR19","unstructured":"Li, C., Farkhoor, H., Liu, R., Yosinski, J.: Measuring the intrinsic dimension of objective landscapes. In: International Conference on Learning Representations (2018)"},{"key":"18_CR20","doi-asserted-by":"crossref","unstructured":"Li, Z., Wu, X., Du, H., Nghiem, H., Shi, G.: Benchmark evaluations, applications, and challenges of large vision language models: a survey. arXiv preprint arXiv:2501.02189 (2025)","DOI":"10.32388\/GXR68Q"},{"key":"18_CR21","unstructured":"Liu, Y., et al.: Trustworthy LLMs: a survey and guideline for evaluating large language models\u2019 alignment. In: Socially Responsible Language Modelling Research (2023). https:\/\/openreview.net\/forum?id=oss9uaPFfB"},{"key":"18_CR22","unstructured":"Liu, Z., et al.: Visual-RFT: visual reinforcement fine-tuning. CoRR abs\/2503.01785 (2025). https:\/\/doi.org\/10.48550\/arXiv.2503.01785"},{"key":"18_CR23","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. Adv. Neural. Inf. Process. Syst. 35, 2507\u20132521 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR24","doi-asserted-by":"crossref","unstructured":"Mathew, M., Bagal, V., Tito, R., Karatzas, D., Valveny, E., Jawahar, C.: InfographicVQA. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 
1697\u20131706 (2022)","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"18_CR25","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"18_CR26","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: OCR-VQA: visual question answering by reading text in images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 947\u2013952. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"18_CR27","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR28","doi-asserted-by":"crossref","unstructured":"Pan, J., et al.: MedVLM-R1: incentivizing medical reasoning capability of vision-language models (VLMs) via reinforcement learning. CoRR abs\/2502.19634 (2025). https:\/\/doi.org\/10.48550\/arXiv.2502.19634","DOI":"10.1007\/978-3-032-04981-0_32"},{"key":"18_CR29","doi-asserted-by":"crossref","unstructured":"Scius-Bertrand, A., Jungo, M., V\u00f6gtlin, L., Spat, J.M., Fischer, A.: Zero-shot prompting and few-shot fine-tuning: revisiting document image classification using large language models. In: International Conference on Pattern Recognition, pp. 152\u2013166. Springer (2025)","DOI":"10.1007\/978-3-031-78495-8_10"},{"key":"18_CR30","unstructured":"Shao, Z., et\u00a0al.: DeepSeekMath: pushing the limits of mathematical reasoning in open language models. arXiv preprint arXiv:2402.03300 (2024)"},{"key":"18_CR31","unstructured":"Shen, H., et\u00a0al.: VLM-R1: a stable and generalizable r1-style large vision-language model. 
arXiv preprint arXiv:2504.07615 (2025)"},{"key":"18_CR32","unstructured":"Snell, C., Lee, J., Xu, K., Kumar, A.: Scaling LLM test-time compute optimally can be more effective than scaling model parameters. arXiv preprint arXiv:2408.03314 (2024)"},{"key":"18_CR33","doi-asserted-by":"crossref","unstructured":"Tanaka, R., Nishida, K., Yoshida, S.: VisualMRC: machine reading comprehension on document images. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 13878\u201313888 (2021)","DOI":"10.1609\/aaai.v35i15.17635"},{"key":"18_CR34","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"18_CR35","unstructured":"Team, G., et\u00a0al.: Gemma 3 technical report. arXiv preprint arXiv:2503.19786 (2025)"},{"key":"18_CR36","unstructured":"Wang, Z., et al.: Jigsaw-R1: a study of rule-based visual reinforcement learning with jigsaw puzzles. arXiv preprint arXiv:2505.23590 (2025)"},{"key":"18_CR37","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR38","unstructured":"Wen, X., et\u00a0al.: Reinforcement learning with verifiable rewards implicitly incentivizes correct reasoning in base LLMs. arXiv preprint arXiv:2506.14245 (2025)"},{"key":"18_CR39","unstructured":"Yang, A., et\u00a0al.: Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)"},{"key":"18_CR40","unstructured":"Yang, Y., et\u00a0al.: R1-OneVision: advancing generalized multimodal reasoning through cross-modal formalization. arXiv preprint arXiv:2503.10615 (2025)"},{"key":"18_CR41","doi-asserted-by":"crossref","unstructured":"Yang, Y., et\u00a0al.: Scaling text-rich image understanding via code-guided synthetic multimodal data generation. 
arXiv preprint arXiv:2502.14846 (2025)","DOI":"10.18653\/v1\/2025.acl-long.855"},{"key":"18_CR42","unstructured":"Zhan, Y., et al.: Vision-R1: evolving human-free alignment in large vision-language models via vision-guided reinforcement learning. CoRR abs\/2503.18013 (2025). https:\/\/doi.org\/10.48550\/arXiv.2503.18013"},{"key":"18_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, J., Huang, J., Jin, S., Lu, S.: Vision-language models for vision tasks: a survey. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"18_CR44","unstructured":"Zhou, H., Li, X., Wang, R., Cheng, M., Zhou, T., Hsieh, C.J.: R1-zero\u2019s \"Aha Moment\" in visual reasoning on a 2B Non-SFT model. CoRR abs\/2503.05132 (2025). https:\/\/doi.org\/10.48550\/arXiv.2503.05132"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-09368-4_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:42Z","timestamp":1763921682000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-09368-4_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"ISBN":["9783032093677","9783032093684"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-09368-4_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"24 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference 
Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}