{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T09:39:06Z","timestamp":1775036346631,"version":"3.50.1"},"reference-count":13,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T00:00:00Z","timestamp":1764288000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T00:00:00Z","timestamp":1764288000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,11,28]]},"DOI":"10.1109\/icvisp68610.2025.11451708","type":"proceedings-article","created":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T19:48:48Z","timestamp":1774986528000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Vision-driven Adaptive Decoding for Document Understanding: A Dual-Path Visual-Language and OCR Framework with Prompt-aware Task Routing"],"prefix":"10.1109","author":[{"given":"Ao","family":"Shen","sequence":"first","affiliation":[{"name":"Beihang University,School of Artificial Intelligence,Beijing,China"}]},{"given":"Tongfei","family":"Chen","sequence":"additional","affiliation":[{"name":"Beihang University,School of Artificial Intelligence,Beijing,China"}]},{"given":"Yangyang","family":"Sun","sequence":"additional","affiliation":[{"name":"Beijing Institute of Control and Electronics Technology,Beijing,China"}]},{"given":"Feng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Beijing Institute of Control and Electronics Technology,Beijing,China"}]},{"given":"Yaogong","family":"Feng","sequence":"additional","affiliation":[{"name":"Beijing Institute of Control and Electronics Technology,Beijing,China"}]},{"given":"Kun","family":"Hu","sequence":"additional","affiliation":[{"name":"Beihang University,School of Artificial Intelligence,Beijing,China"}]},{"given":"Mengqi","family":"Liu","sequence":"additional","affiliation":[{"name":"China Information Communication Technologies Group Design Institute Co., Ltd., China Unicom,Beijing,China"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Qwen2.5-vl technical report","author":"S. B","year":"2025"},{"key":"ref2","article-title":"Paddleocr 3.0 technical report","volume-title":"arXiv preprint arXiv:2507.05595","author":"Cui","year":"2025"},{"key":"ref3","first-page":"4258","article-title":"Rapidocr: An efficient and accurate ocr system based on convolutional neural networks","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"Deng"},{"key":"ref4","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52734.2025.02313","article-title":"Omnidocbench: Benchmarking diverse pdf document parsing with comprehensive annotations","author":"Ouyang","year":"2025"},{"key":"ref5","article-title":"Donut: Document understanding transformer without ocr","author":"Kim","year":"2021","journal-title":"arXiv preprint arXiv:2111.15664"},{"key":"ref6","article-title":"Pix2struct: Schematic parsing with structured text generation","author":"Xu","year":"2022","journal-title":"arXiv preprint arXiv:2209.03557"},{"key":"ref7","article-title":"Ocr-augmented vision-language models for structured document understanding","author":"Yang","year":"2022","journal-title":"arXiv preprint arXiv:2207.03403"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403172"},{"key":"ref9","first-page":"4567","article-title":"Fusionocr: Feature-level integration of ocr and vision-language models for document understanding","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics","author":"Chen"},{"key":"ref10","first-page":"1234","article-title":"Vlmocr: Schema-constrained vision-language model for structured document generation","volume":"33","author":"Wang","year":"2024","journal-title":"IEEE Transactions on Image Processing"},{"key":"ref11","first-page":"3456","article-title":"Llm-based post-processing for ocr error correction in document understanding","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia","author":"Liu"},{"key":"ref12","article-title":"General ocr theory: Towards ocr-2.0 via a unified end-to-end model","author":"Wei","year":"2024","journal-title":"arXiv preprint arXiv:2410.04567"},{"key":"ref13","article-title":"Enhancing the reasoning ability of multimodal large language models via mixed preference optimization","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2411.10442"}],"event":{"name":"2025 9th International Conference on Vision, Image and Signal Processing (ICVISP)","location":"Xi'an, China","start":{"date-parts":[[2025,11,28]]},"end":{"date-parts":[[2025,11,30]]}},"container-title":["2025 9th International Conference on Vision, Image and Signal Processing (ICVISP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11450551\/11451183\/11451708.pdf?arnumber=11451708","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T06:57:42Z","timestamp":1775026662000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11451708\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,28]]},"references-count":13,"URL":"https:\/\/doi.org\/10.1109\/icvisp68610.2025.11451708","relation":{},"subject":[],"published":{"date-parts":[[2025,11,28]]}}}