{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:05:48Z","timestamp":1775325948866,"version":"3.50.1"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s11432-024-4250-y","type":"journal-article","created":{"date-parts":[[2024,12,16]],"date-time":"2024-12-16T02:16:30Z","timestamp":1734315390000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":34,"title":["DocPedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding"],"prefix":"10.1007","volume":"67","author":[{"given":"Hao","family":"Feng","sequence":"first","affiliation":[]},{"given":"Qi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jingqun","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Can","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,13]]},"reference":[{"key":"4250_CR1","first-page":"87","volume-title":"Proceedings of Fall Joint Computer Conference","author":"S N Srihari","year":"1986","unstructured":"Srihari S N, Lam S W, Govindaraju V, et al. Document image understanding. In: Proceedings of Fall Joint Computer Conference, 1986. 87\u201395"},{"key":"4250_CR2","volume-title":"Proceedings of Neural Information Processing Systems","author":"W Hwang","year":"2019","unstructured":"Hwang W, Kim S, Seo M, et al. Post-OCR parsing: building simple and robust parser via BIO tagging. In: Proceedings of Neural Information Processing Systems, 2019"},{"key":"4250_CR3","first-page":"498","volume-title":"Proceedings of European Conference on Computer Vision","author":"G Kim","year":"2022","unstructured":"Kim G, Hong T, Yim M, et al. OCR-free document understanding transformer. In: Proceedings of European Conference on Computer Vision, 2022. 498\u2013517"},{"key":"4250_CR4","first-page":"7092","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"C W Luo","year":"2023","unstructured":"Luo C W, Cheng C X, Zheng Q, et al. GeoLayoutLM: geometric pre-training for visual information extraction. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2023. 7092\u20137101"},{"key":"4250_CR5","unstructured":"Ye Q H, Xu H Y, Xu G H, et al. mPLUG-Owl: modularization empowers large language models with multimodality. 2023. ArXiv:2304.14178"},{"key":"4250_CR6","unstructured":"Feng H, Wang Z J, Tang J Q, et al. UniDoc: a universal large multimodal model for simultaneous text detection, recognition, spotting and understanding. 2023. ArXiv:2308.11592"},{"key":"4250_CR7","doi-asserted-by":"crossref","unstructured":"Ye J B, Hu A W, Xu H Y, et al. UReader: universal OCR-free visually-situated language understanding with multimodal large language model. 2023. ArXiv:2310.05126","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"4250_CR8","unstructured":"Lv T C, Huang Y P, Chen J Y, et al. KOSMOS-2.5: a multimodal literate model. 2023. ArXiv:2309.11419"},{"key":"4250_CR9","unstructured":"Xu Y H, Lv T C, Cui L, et al. LayoutxLM: multimodal pre-training for multilingual visually-rich document understanding. 2021. ArXiv:2104.08836"},{"key":"4250_CR10","first-page":"1192","volume-title":"Proceedings of KDD","author":"Y H Xu","year":"2020","unstructured":"Xu Y H, Li M H, Cui L, et al. LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of KDD, 2020. 1192\u20131200"},{"key":"4250_CR11","doi-asserted-by":"publisher","first-page":"4083","DOI":"10.1145\/3503161.3548112","volume-title":"Proceedings of ACM International Conference on Multimedia","author":"Y P Huang","year":"2022","unstructured":"Huang Y P, Lv T C, Cui L, et al. LayoutLMv3: pre-training for document AI with unified text and image masking. In: Proceedings of ACM International Conference on Multimedia, 2022. 4083\u20134091"},{"key":"4250_CR12","first-page":"10767","volume-title":"Proceedings of AAAI","author":"T Hong","year":"2022","unstructured":"Hong T, Kim D H, Ji M, et al. BROS: a pre-trained language model focusing on text and layout for better key information extraction from documents. In: Proceedings of AAAI, 2022. 36: 10767\u201310775"},{"key":"4250_CR13","unstructured":"Bai H L, Liu Z G, Meng X J, et al. Wukong-Reader: multi-modal pre-training for fine-grained visual document understanding. 2022. ArXiv:2212.09621"},{"key":"4250_CR14","first-page":"19254","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Z N Tang","year":"2023","unstructured":"Tang Z N, Yang Z Y, Wang G X, et al. Unifying vision, text, and layout for universal document processing. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2023. 19254\u201319264"},{"key":"4250_CR15","first-page":"1912","volume-title":"Proceedings of ACM International Conference on Multimedia","author":"Y L Li","year":"2021","unstructured":"Li Y L, Qian Y X, Yu Y C, et al. StrucTexT: structured text understanding with multi-modal transformers. In: Proceedings of ACM International Conference on Multimedia, 2021. 1912\u20131920"},{"key":"4250_CR16","doi-asserted-by":"crossref","unstructured":"Peng Q M, Pan Y X, Wang W J, et al. ERNIE-Layout: layout knowledge enhanced pre-training for visually-rich document understanding. 2022. ArXiv:2210.06155","DOI":"10.18653\/v1\/2022.findings-emnlp.274"},{"key":"4250_CR17","first-page":"993","volume-title":"Proceedings of International Conference on Computer Vision","author":"S Appalaraju","year":"2021","unstructured":"Appalaraju S, Jasani B, Kota B U, et al. DocFormer: end-to-end transformer for document understanding. In: Proceedings of International Conference on Computer Vision, 2021. 993\u20131003"},{"key":"4250_CR18","first-page":"11474","volume-title":"Proceedings of AAAI","author":"M Liao","year":"2020","unstructured":"Liao M, Wan Z, Yao C, et al. Real-time scene text detection with differentiable binarization. In: Proceedings of AAAI, 2020. 34: 11474\u201311481"},{"key":"4250_CR19","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2016","unstructured":"Shi B, Bai X, Yao C. An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Trans Pattern Anal Mach Intell, 2016, 39: 2298\u20132304","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"4250_CR20","unstructured":"Zhang Y Z, Zhang R Y, Gu J X, et al. LLaVAR: enhanced visual instruction tuning for text-rich image understanding. 2023. ArXiv:2306.17107"},{"key":"4250_CR21","unstructured":"Ye J B, Hu A W, Xu H Y, et al. mPLUG-DocOwl: modularized multimodal large language model for document understanding. 2023. ArXiv:2307.02499"},{"key":"4250_CR22","first-page":"18893","volume-title":"Proceedings of International Conference on Machine Learning","author":"K Lee","year":"2023","unstructured":"Lee K, Joshi M, Turc I R, et al. Pix2Struct: screenshot parsing as pretraining for visual language understanding. In: Proceedings of International Conference on Machine Learning, 2023. 18893\u201318912"},{"key":"4250_CR23","first-page":"8748","volume-title":"Proceedings of International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"Radford A, Kim J W, Hallacy C, et al. Learning transferable visual models from natural language supervision. In: Proceedings of International Conference on Machine Learning, 2021. 8748\u20138763"},{"key":"4250_CR24","unstructured":"Liu Y L, Li Z, Li H L, et al. On the hidden mystery of OCR in large multimodal models. 2023. ArXiv:2305.07895"},{"key":"4250_CR25","first-page":"30","volume-title":"Proceedings of Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, et al. Attention is all you need. In: Proceedings of Neural Information Processing Systems, 2017. 30"},{"key":"4250_CR26","unstructured":"Touvron H, Lavril T, Izacard G, et al. LLaMA: open and efficient foundation language models. 2023. ArXiv:2302.13971"},{"key":"4250_CR27","volume-title":"Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"W-L Chiang","year":"2023","unstructured":"Chiang W-L, Li Z H, Lin Z, et al. Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. 2023. https:\/\/vicuna.lmsys.org"},{"key":"4250_CR28","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1109\/T-C.1974.223784","volume":"23","author":"N Ahmed","year":"1974","unstructured":"Ahmed N, Natarajan T, Rao K R. Discrete cosine transform. IEEE Trans Comput, 1974, 23: 90\u201393","journal-title":"IEEE Trans Comput"},{"key":"4250_CR29","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1145\/103085.103089","volume":"34","author":"G K Wallace","year":"1991","unstructured":"Wallace G K. The JPEG still picture compression standard. Commun ACM, 1991, 34: 30\u201344","journal-title":"Commun ACM"},{"key":"4250_CR30","first-page":"1649","volume-title":"Proceedings of AAAI","author":"H Liu","year":"2023","unstructured":"Liu H, Jiang X, Li X, et al. The devil is in the frequency: geminated gestalt autoencoder for self-supervised visual pretraining. In: Proceedings of AAAI, 2023. 37: 1649\u20131656"},{"key":"4250_CR31","first-page":"12073","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"H Liu","year":"2022","unstructured":"Liu H, Jiang X H, Li X, et al. NomMer: nominate synergistic context in vision transformer for visual recognition. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2022. 12073\u201312082"},{"key":"4250_CR32","first-page":"5676","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"X B Liu","year":"2018","unstructured":"Liu X B, Liang D, Yan S, et al. FOTS: fast oriented text spotting with a unified network. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2018. 5676\u20135685"},{"key":"4250_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"M Z Hossain","year":"2019","unstructured":"Hossain M Z, Sohel F, Shiratuddin M F, et al. A comprehensive survey of deep learning for image captioning. ACM Comput Surv, 2019, 51: 1\u201336","journal-title":"ACM Comput Surv"},{"key":"4250_CR34","first-page":"1877","volume-title":"Proceedings of Neural Information Processing Systems","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N, et al. Language models are few-shot learners. In: Proceedings of Neural Information Processing Systems, 2020. 33: 1877\u20131901"},{"key":"4250_CR35","unstructured":"Liu H T, Li C Y, Wu Q Y, et al. Visual instruction tuning. 2023. ArXiv:2304.08485"},{"key":"4250_CR36","unstructured":"Yu Y C, Li Y L, Zhang C Q, et al. StrucTexTv2: masked visual-textual prediction for document image pre-training. 2023. ArXiv:2303.00289"},{"key":"4250_CR37","first-page":"16000","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"K M He","year":"2022","unstructured":"He K M, Chen X L, Xie S N, et al. Masked autoencoders are scalable vision learners. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2022. 16000\u201316009"},{"key":"4250_CR38","unstructured":"Devlin J, Chang M-W, Lee K, et al. BERT: pre-training of deep bidirectional transformers for language understanding. 2018. ArXiv:1810.04805"},{"key":"4250_CR39","unstructured":"Peng Z L, Wang W H, Dong L, et al. KOSMOS-2: grounding multimodal large language models to the world. 2023. ArXiv:2306.14824"},{"key":"4250_CR40","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Z Li","year":"2024","unstructured":"Li Z, Yang B, Liu Q, et al. Monkey: image resolution and text label are important things for large multi-modal models. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4250_CR41","unstructured":"Liu Y L, Yang B, Liu Q, et al. TextMonkey: an OCR-free large multimodal model for understanding document. 2024. ArXiv:2403.04473"},{"key":"4250_CR42","first-page":"10012","volume-title":"Proceedings of International Conference on Computer Vision","author":"Z Liu","year":"2021","unstructured":"Liu Z, Li Y T, Cao Y, et al. Swin Transformer: hierarchical vision transformer using shifted windows. In: Proceedings of International Conference on Computer Vision, 2021. 10012\u201310022"},{"key":"4250_CR43","unstructured":"Zhu D Y, Chen J, Shen X Q, et al. MiniGPT-4: enhancing vision-language understanding with advanced large language models. 2023. ArXiv:2304.10592"},{"key":"4250_CR44","first-page":"1457","volume-title":"Proceedings of International Conference on Computer Vision","author":"K Wang","year":"2011","unstructured":"Wang K, Babenko B, Belongie S. End-to-end scene text recognition. In: Proceedings of International Conference on Computer Vision, 2011. 1457\u20131464"},{"key":"4250_CR45","first-page":"2200","volume-title":"Proceedings of IEEE Winter Conference on Applications of Computer Vision (WACV)","author":"M Mathew","year":"2021","unstructured":"Mathew M, Karatzas D, Jawahar C V. DocVQA: a dataset for VQA on document images. In: Proceedings of IEEE Winter Conference on Applications of Computer Vision (WACV), 2021. 2200\u20132209"},{"key":"4250_CR46","first-page":"947","volume-title":"Proceedings of International Conference on Document Analysis and Recognition","author":"A Mishra","year":"2019","unstructured":"Mishra A, Shekhar S, Singh A K, et al. OCR-VQA: visual question answering by reading text in images. In: Proceedings of International Conference on Document Analysis and Recognition, 2019. 947\u2013952"},{"key":"4250_CR47","first-page":"8317","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"A Singh","year":"2019","unstructured":"Singh A, Natarajan V, Shah M, et al. Towards VQA models that can read. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2019. 8317\u20138326"},{"key":"4250_CR48","first-page":"1697","volume-title":"Proceedings of IEEE Winter Conference on Applications of Computer Vision (WACV)","author":"M Mathew","year":"2022","unstructured":"Mathew M, Bagal V, Tito R, et al. InfographicVQA. In: Proceedings of IEEE Winter Conference on Applications of Computer Vision (WACV), 2022. 1697\u20131706"},{"key":"4250_CR49","doi-asserted-by":"crossref","unstructured":"Masry A, Long D X, Tan J Q, et al. ChartQA: a benchmark for question answering about charts with visual and logical reasoning. 2022. ArXiv:2203.10244","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"4250_CR50","unstructured":"Kahou S E, Michalski V, Atkinson A, et al. FigureQA: an annotated figure dataset for visual reasoning. 2017. ArXiv:1710.07300"},{"key":"4250_CR51","first-page":"1","volume-title":"Proceedings of International Conference on Document Analysis and Recognition Workshops","author":"G Jaume","year":"2019","unstructured":"Jaume G, Ekenel H K, Thiran J-P. FUNSD: a dataset for form understanding in noisy scanned documents. In: Proceedings of International Conference on Document Analysis and Recognition Workshops, 2019. 2: 1\u20136"},{"key":"4250_CR52","first-page":"1516","volume-title":"Proceedings of International Conference on Document Analysis and Recognition","author":"Z Huang","year":"2019","unstructured":"Huang Z, Chen K, He J H, et al. ICDAR 2019 competition on scanned receipt OCR and information extraction. In: Proceedings of International Conference on Document Analysis and Recognition, 2019. 1516\u20131520"},{"key":"4250_CR53","first-page":"36","volume-title":"Proceedings of International Conference on Document Analysis and Recognition","author":"J F Kuang","year":"2023","unstructured":"Kuang J F, Hua W, Liang D K, et al. Visual information extraction in the wild: practical dataset and end-to-end solution. In: Proceedings of International Conference on Document Analysis and Recognition, 2023. 36\u201353"},{"key":"4250_CR54","first-page":"369","volume-title":"Proceedings of AIMLMOA","author":"L N Smith","year":"2019","unstructured":"Smith L N, Topin N. Super-convergence: very fast training of neural networks using large learning rates. In: Proceedings of AIMLMOA, 2019. 11006: 369\u2013386"},{"key":"4250_CR55","unstructured":"Loshchilov I, Hutter F. Decoupled weight decay regularization. 2017. ArXiv:1711.05101"},{"key":"4250_CR56","first-page":"19730","volume-title":"Proceedings of International Conference on Machine Learning","author":"J N Li","year":"2023","unstructured":"Li J N, Li D X, Savarese S, et al. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of International Conference on Machine Learning, 2023. 19730\u201319742"},{"key":"4250_CR57","volume-title":"Proceedings of Neural Information Processing Systems","author":"W L Dai","year":"2024","unstructured":"Dai W L, Li J N, Li D X, et al. InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Proceedings of Neural Information Processing Systems, 2024"},{"key":"4250_CR58","first-page":"2256","volume-title":"Proceedings of AAAI","author":"W Hu","year":"2024","unstructured":"Hu W, Xu Y, Li Y, et al. BLIVA: A simple multimodal LLM for better handling of text-rich visual questions. In: Proceedings of AAAI, 2024. 38: 2256\u20132264"},{"key":"4250_CR59","unstructured":"Liu H T, Li C Y, Li Y H, et al. Improved baselines with visual instruction tuning. 2023. ArXiv:2310.03744"},{"key":"4250_CR60","unstructured":"Wang Y H, Zhou W G, Feng H, et al. Towards improving document understanding: an exploration on text-grounding via MLLMs. 2023. ArXiv:2311.13194"},{"key":"4250_CR61","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Q H Ye","year":"2024","unstructured":"Ye Q H, Xu H Y, Ye J B, et al. mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4250_CR62","unstructured":"Dong X Y, Zhang P, Zang Y H, et al. InternLM-XComposer2: mastering free-form text-image composition and comprehension in vision-language large model. 2024. ArXiv:2401.16420"},{"key":"4250_CR63","first-page":"1563","volume-title":"Proceedings of International Conference on Document Analysis and Recognition","author":"A F Biten","year":"2019","unstructured":"Biten A F, Tito R, Mafla A, et al. ICDAR 2019 competition on scene text visual question answering. In: Proceedings of International Conference on Document Analysis and Recognition, 2019. 1563\u20131570"},{"key":"4250_CR64","unstructured":"Bai J, Bai S, Yang S S, et al. Qwen-VL: a Frontier large vision-language model with versatile abilities. 2023. ArXiv:2308.12966"},{"key":"4250_CR65","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et al. An image is worth 16 \u00d7 16 words: transformers for image recognition at scale. 2020. ArXiv:2010.11929"},{"key":"4250_CR66","doi-asserted-by":"publisher","first-page":"109834","DOI":"10.1016\/j.patcog.2023.109834","volume":"144","author":"R Tito","year":"2023","unstructured":"Tito R, Karatzas D, Valveny E. Hierarchical multimodal transformers for multipage DocVQA. Pattern Recogn, 2023, 144: 109834","journal-title":"Pattern Recogn"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4250-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4250-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4250-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T22:02:40Z","timestamp":1768860160000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4250-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":66,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["4250"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4250-y","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"29 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 September 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"220106"}}