{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T17:37:49Z","timestamp":1758044269476,"version":"3.44.0"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032046130","type":"print"},{"value":"9783032046147","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T00:00:00Z","timestamp":1757721600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T00:00:00Z","timestamp":1757721600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04614-7_4","type":"book-chapter","created":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T12:22:57Z","timestamp":1757679777000},"page":"58-75","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["HIP: Hierarchical Point Modeling and\u00a0Pre-training for\u00a0Visual Information 
Extraction"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1330-3193","authenticated-orcid":false,"given":"Rujiao","family":"Long","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1204-8860","authenticated-orcid":false,"given":"Pengfei","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2343-7750","authenticated-orcid":false,"given":"Zhibo","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3452-9170","authenticated-orcid":false,"given":"Wenqing","family":"Cheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,13]]},"reference":[{"key":"4_CR1","doi-asserted-by":"crossref","unstructured":"Cao, H., et al.: Attention where it matters: rethinking visual document understanding with selective region concentration. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19517\u201319527 (2023)","DOI":"10.1109\/ICCV51070.2023.01788"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Davis, B., Morse, B., Price, B., Tensmeyer, C., Wigington, C., Morariu, V.: End-to-end document recognition and understanding with dessurt. In: European Conference on Computer Vision, pp. 280\u2013296. Springer (2022)","DOI":"10.1007\/978-3-031-25069-9_19"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"4_CR4","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805 (2018)"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Dhouib, M., Bettaieb, G., Shabou, A.: DocParser: end-to-end ocr-free information extraction from visually rich documents. arXiv preprint arXiv:2304.12484 (2023)","DOI":"10.1007\/978-3-031-41734-4_10"},{"key":"4_CR6","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Duan, K., Bai, S., Xie, L., Qi, H., Huang, Q., Tian, Q.: CenterNet: keypoint triplets for object detection. In: 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 \u2013 November 2, 2019, pp. 6568\u20136577. IEEE (2019)","DOI":"10.1109\/ICCV.2019.00667"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Fang, S., Xie, H., Wang, Y., Mao, Z., Zhang, Y.: Read like humans: autonomous, bidirectional and iterative language modeling for scene text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7098\u20137107 (2021)","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Gu, Z., et al.: Xylayoutlm: towards layout-aware multimodal networks for visually-rich document understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4583\u20134592 (2022)","DOI":"10.1109\/CVPR52688.2022.00454"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Guan, T., et al.: Self-supervised character-to-character distillation for text recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
19473\u201319484 (2023)","DOI":"10.1109\/ICCV51070.2023.01784"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"4_CR12","doi-asserted-by":"crossref","unstructured":"He, T., Tian, Z., Huang, W., Shen, C., Qiao, Y., Sun, C.: An end-to-end textspotter with explicit alignment and attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5020\u20135029 (2018)","DOI":"10.1109\/CVPR.2018.00527"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Huang, M., et al.: Swintextspotter: scene text spotting via better synergy between text detection and text recognition. In: proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4593\u20134603 (2022a)","DOI":"10.1109\/CVPR52688.2022.00455"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: LayoutLMv3: pre-training for Document AI with unified text and image masking. In: Proceedings of the 30th ACM International Conference on Multimedia (2022b)","DOI":"10.1145\/3503161.3548112"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: Icdar2019 competition on scanned receipt ocr and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.-P.: Funsd: a dataset for form understanding in noisy scanned documents. In: 2019 International Conference on Document Analysis and Recognition Workshops (ICDARW), vol. 2, pp. 1\u20136. 
IEEE (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Wang, J., Peng, D., Liu, C., Jin, L.: Revisiting scene text recognition: a data perspective. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20543\u201320554 (2023)","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: Ocr-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"4_CR19","unstructured":"Kim, G., et al.: Donut: document understanding transformer without OCR. ArXiv, abs\/2111.15664 (2021)"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Kittenplon, Y., Lavi, I., Fogel, S., Bar, Y., Manmatha, R., Perona, P.: Towards weakly-supervised text spotting using a multi-task transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4604\u20134613 (2022)","DOI":"10.1109\/CVPR52688.2022.00456"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Lee, C.-Y., et al.: Formnet: structural encoding beyond sequential modeling in form document information extraction. arXiv preprint arXiv:2203.08411 (2022)","DOI":"10.18653\/v1\/2022.acl-long.260"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Lee, C.-Y., et al.: Rope: reading order equivariant positional encoding for graph-based document information extraction. arXiv preprint arXiv:2106.10786 (2021)","DOI":"10.18653\/v1\/2021.acl-short.41"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Lee, C.-Y., et al.: Formnetv2: multimodal graph contrastive learning for form document information extraction. 
arXiv preprint arXiv:2305.02549 (2023)","DOI":"10.18653\/v1\/2023.acl-long.501"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Lewis, D., Agam, G., Argamon, S., Frieder, O., Grossman, D., Heard, J.: Building a test collection for complex document information processing. In: Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 665\u2013666 (2006)","DOI":"10.1145\/1148170.1148307"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Li, H., Wang, P., Shen, C.: Towards end-to-end text spotting with convolutional recurrent neural networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5238\u20135246 (2017)","DOI":"10.1109\/ICCV.2017.560"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Structext: Structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Liao, M., Pang, G., Huang, J., Hassner, T., Bai, X.: Mask textspotter v3: segmentation proposal network for robust scene text spotting. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI 16, pp. 706\u2013722. Springer (2020a)","DOI":"10.1007\/978-3-030-58621-8_41"},{"key":"4_CR28","first-page":"11474","volume":"34\u201307","author":"M Liao","year":"2020","unstructured":"Liao, M., Wan, Z., Yao, C., Chen, K., Bai, X.: Real-time scene text detection with differentiable binarization. Proc. AAAI Conf. Artif.Intell. 34\u201307, 11474\u201311481 (2020)","journal-title":"Proc. AAAI Conf. Artif.Intell."},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Liu, X., Liang, D., Yan, S., Chen, D., Qiao, Y., Yan, J.: Fots: fast oriented text spotting with a unified network. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5676\u20135685 (2018)","DOI":"10.1109\/CVPR.2018.00595"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, H., Shen, C., He, T., Jin, L., Wang, L.: Abcnet: real-time scene text spotting with adaptive bezier-curve network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9809\u20139818 (2020)","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"4_CR31","doi-asserted-by":"publisher","first-page":"110816","DOI":"10.1016\/j.patcog.2024.110816","volume":"157","author":"R Long","year":"2025","unstructured":"Long, R., et al.: Lore++: logical location regression network for table structure recognition with pre-training. Pattern Recogn. 157, 110816 (2025)","journal-title":"Pattern Recogn."},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Long, S., Qin, S., Panteleev, D., Bissacco, A., Fujii, Y., Raptis, M.: Towards end-to-end unified scene text detection and layout analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1049\u20131059 (2022)","DOI":"10.1109\/CVPR52688.2022.00112"},{"key":"4_CR33","unstructured":"Park, S., et al.: CORD: a consolidated receipt dataset for post-OCR parsing. In: Workshop on Document Intelligence at NeurIPS, vol. 2019 (2019)"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"Peng, D., et al.: SPTS: single point text spotting. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4272\u20134281 (2022)","DOI":"10.1145\/3503161.3547942"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Shi, B., Bai, X., Belongie, S.: Detecting oriented text in natural images by linking segments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
2550\u20132558 (2017)","DOI":"10.1109\/CVPR.2017.371"},{"issue":"11","key":"4_CR36","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2016","unstructured":"Shi, B., Bai, X., Yao, C.: An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Trans. Pattern Anal. Mach. Intell. 39(11), 2298\u20132304 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"9","key":"4_CR37","doi-asserted-by":"publisher","first-page":"2035","DOI":"10.1109\/TPAMI.2018.2848939","volume":"41","author":"B Shi","year":"2018","unstructured":"Shi, B., Yang, M., Wang, X., Lyu, P., Yao, C., Bai, X.: Aster: an attentional scene text recognizer with flexible rectification. IEEE Trans. Pattern Anal. Mach. Intell. 41(9), 2035\u20132048 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4_CR38","first-page":"2738","volume":"35","author":"J Wang","year":"2021","unstructured":"Wang, J., et al.: Towards robust visual information extraction in real world: new dataset and novel solution. Proc. AAAI Conf. Artif. Intell. 35, 2738\u20132745 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"4_CR39","first-page":"2782","volume":"35","author":"P Wang","year":"2021","unstructured":"Wang, P., et al.: Pgnet: real time arbitrarily-shaped text spotting with point gathering network. Proc. AAAI Conf. Artif. Intell. 35, 2782\u20132790 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: AE TextSpotter: learning visual and linguistic representation for ambiguous text spotting. 
In: European Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-58568-6_27"},{"issue":"9","key":"4_CR41","first-page":"5349","volume":"44","author":"W Wang","year":"2021","unstructured":"Wang, W., et al.: Pan++: towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE Trans. Pattern Anal. Mach. Intell. 44(9), 5349\u20135367 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Xing, L., Tian, Z., Huang, W., Scott, M.R.: Convolutional character networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9126\u20139136 (2019)","DOI":"10.1109\/ICCV.2019.00922"},{"key":"4_CR43","first-page":"1192","volume-title":"KDD \u201920: The 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Virtual Event, CA, USA, August 23\u201327, 2020","author":"Y Xu","year":"2020","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Gupta, R., Liu, Y., Tang, J., Prakash, B.A. (eds.) KDD \u201920: The 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Virtual Event, CA, USA, August 23\u201327, 2020, pp. 1192\u20131200. ACM (2020)"},{"key":"4_CR44","unstructured":"Xu, Y., et al.: Layoutxlm: multimodal pretraining for multilingual visually-rich document understanding. ArXiv preprint, abs\/2104.08836 (2021a)"},{"key":"4_CR45","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 2579\u20132591. 
ACL (2021b)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"4_CR46","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Modeling entities as semantic points for visual information extraction in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15358\u201315367 (2023)","DOI":"10.1109\/CVPR52729.2023.01474"},{"key":"4_CR47","doi-asserted-by":"crossref","unstructured":"Ye, M., et al.: Deepsolo: let transformer decoder with explicit points solo for text spotting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19348\u201319357 (2023)","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"4_CR48","unstructured":"Yu, Y., et al.: StrucTexTv2: masked visual-textual prediction for document image pretraining. arXiv preprint arXiv:2303.00289 (2023)"},{"key":"4_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: TRIE: End-to-end text reading and information extraction for document understanding. In MM\u201920: The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA, October 12-16, vol. 2020, pp. 1413\u20131422 (2020)","DOI":"10.1145\/3394171.3413900"},{"key":"4_CR50","unstructured":"Zhou, X., Wang, D., Kr\u00e4henb\u00fchl, P.: Objects as points. arXiv preprint arXiv:1904.07850 (2019)"},{"key":"4_CR51","unstructured":"Fang, S., Xie, H., Wang, Y., Mao, Z., Zhang, Y.: Read like humans: autonomous, bidirectional and iterative language modeling for scene text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7098\u20137107 (2021)"},{"key":"4_CR52","doi-asserted-by":"crossref","unstructured":"Wan, J., et al.: Omniparser: a unified framework for text spotting key information extraction and table recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
15641\u201315653 (2024)","DOI":"10.1109\/CVPR52733.2024.01481"},{"key":"4_CR53","doi-asserted-by":"crossref","unstructured":"Okamoto, Y., et al.: CREPE: coordinate-aware end-to-end document parser. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 3\u201320. IEEE (2024)","DOI":"10.1007\/978-3-031-70546-5_1"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04614-7_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T12:23:16Z","timestamp":1757679796000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04614-7_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,13]]},"ISBN":["9783032046130","9783032046147"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04614-7_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,13]]},"assertion":[{"value":"13 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference 
Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}