{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:41:06Z","timestamp":1780357266697,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62441604, 61936003"],"award-info":[{"award-number":["62441604, 61936003"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680931","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"5171-5180","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["PEneo: Unifying Line Extraction, Line Grouping, and Entity Linking for End-to-end Document Pair Extraction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8681-311X","authenticated-orcid":false,"given":"Zening","family":"Lin","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2060-3488","authenticated-orcid":false,"given":"Jiapeng","family":"Wang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5852-7960","authenticated-orcid":false,"given":"Teng","family":"Li","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3823-0132","authenticated-orcid":false,"given":"Wenhui","family":"Liao","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8248-2906","authenticated-orcid":false,"given":"Dayi","family":"Huang","sequence":"additional","affiliation":[{"name":"Kingsoft Office, Zhuhai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2029-8391","authenticated-orcid":false,"given":"Longfei","family":"Xiong","sequence":"additional","affiliation":[{"name":"Kingsoft Office, Zhuhai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5456-0957","authenticated-orcid":false,"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"DocFormer: End-to-End Transformer for Document Understanding. In 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE, 973--983","author":"Appalaraju Srikar","year":"2021","unstructured":"Srikar Appalaraju, Bhavan Jasani, Bhargava Urala Kota, Yusheng Xie, and R Manmatha. 2021. DocFormer: End-to-End Transformer for Document Understanding. In 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE, 973--983."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547877"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016300"},{"key":"e_1_3_2_1_5_1","volume-title":"End-to-End Document Recognition and Understanding with Dessurt. In European Conference on Computer Vision. Springer, 280--296","author":"Davis Brian","year":"2022","unstructured":"Brian Davis, Bryan Morse, Brian Price, Chris Tensmeyer, Curtis Wigington, and Vlad Morariu. 2022. End-to-End Document Recognition and Understanding with Dessurt. In European Conference on Computer Vision. Springer, 280--296."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings, Part I 16","author":"Davis Brian","year":"2021","unstructured":"Brian Davis, Bryan Morse, Brian Price, Chris Tensmeyer, and Curtis Wiginton. 2021. Visual FUDGE: Form Understanding via Dynamic Graph Editing. In Document Analysis and Recognition--ICDAR 2021: 16th International Conference, Lausanne, Switzerland, September 5--10, 2021, Proceedings, Part I 16. Springer, 416--431."},{"key":"e_1_3_2_1_7_1","first-page":"39","article-title":"UniDoc: Unified Pretraining Framework for Document Understanding","volume":"34","author":"Gu Jiuxiang","year":"2021","unstructured":"Jiuxiang Gu, Jason Kuen, Vlad I Morariu, Handong Zhao, Rajiv Jain, Nikolaos Barmpalios, Ani Nenkova, and Tong Sun. 2021. UniDoc: Unified Pretraining Framework for Document Understanding. Advances in Neural Information Processing Systems, Vol. 34 (2021), 39--50.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00454"},{"key":"e_1_3_2_1_9_1","volume-title":"Hazim Kemal Ekenel","author":"Thiran Guillaume Jaume Jean-Philippe","year":"2019","unstructured":"Jean-Philippe Thiran Guillaume Jaume, Hazim Kemal Ekenel. 2019. FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents. In ICDAR-OST."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21322"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26516"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_1_13_1","volume-title":"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction. In 2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 1516--1520","author":"Huang Zheng","year":"2019","unstructured":"Zheng Huang, Kai Chen, Jianhua He, Xiang Bai, Dimosthenis Karatzas, Shijian Lu, and CV Jawahar. 2019. ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction. In 2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 1516--1520."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.28"},{"key":"e_1_3_2_1_15_1","volume-title":"OCR-free Document Understanding Transformer. In European Conference on Computer Vision. Springer, 498--517","author":"Kim Geewook","year":"2022","unstructured":"Geewook Kim, Teakgyu Hong, Moonbin Yim, JeongYeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, and Seunghyun Park. 2022. OCR-free Document Understanding Transformer. In European Conference on Computer Vision. Springer, 498--517."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41731-3_3"},{"key":"e_1_3_2_1_17_1","volume-title":"SelfDoc: Self-Supervised Document Representation Learning. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 5648--5656","author":"Li Peizhao","year":"2021","unstructured":"Peizhao Li, Jiuxiang Gu, Jason Kuen, Vlad I Morariu, Handong Zhao, Rajiv Jain, Varun Manjunatha, and Hongfu Liu. 2021. SelfDoc: Self-Supervised Document Representation Learning. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 5648--5656."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475345"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01794"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 7th International Conference on Learning Representations (ICLR).","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proceedings of the 7th International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00685"},{"key":"e_1_3_2_1_22_1","volume-title":"CORD: A Consolidated Receipt Dataset for Post-OCR Parsing. In Workshop on Document Intelligence at NeurIPS","author":"Park Seunghyun","year":"2019","unstructured":"Seunghyun Park, Seung Shin, Bado Lee, Junyeop Lee, Jaeheung Surh, Minjoon Seo, and Hwalsuk Lee. 2019. CORD: A Consolidated Receipt Dataset for Post-OCR Parsing. In Workshop on Document Intelligence at NeurIPS 2019."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.274"},{"key":"e_1_3_2_1_24_1","volume-title":"Natural language processing using very large corpora","author":"Ramshaw Lance A","unstructured":"Lance A Ramshaw and Mitchell P Marcus. 1999. Text chunking using transformation-based learning. In Natural language processing using very large corpora. Springer, 157--176."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2007.4377003"},{"key":"e_1_3_2_1_26_1","volume-title":"Exploring ocr capabilities of gpt-4v (ision): A quantitative and in-depth evaluation. arXiv preprint arXiv:2310.16809","author":"Shi Yongxin","year":"2023","unstructured":"Yongxin Shi, Dezhi Peng, Wenhui Liao, Zening Lin, Xinhong Chen, Chongyu Liu, Yuyi Zhang, and Lianwen Jin. 2023. Exploring ocr capabilities of gpt-4v (ision): A quantitative and in-depth evaluation. arXiv preprint arXiv:2310.16809 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1279"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1132"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.534"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.138"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.385976"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.136"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403172"},{"key":"e_1_3_2_1_34_1","unstructured":"Yiheng Xu Tengchao Lv Lei Cui Guoxin Wang Yijuan Lu Dinei Florencio Cha Zhang and Furu Wei. 2021. LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding. arxiv: 2104.08836 [cs.CL]"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01474"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.337"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.846"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.5"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680931","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680931","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:34Z","timestamp":1750295854000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680931"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3680931","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680931","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}