{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T20:50:33Z","timestamp":1757451033143,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681473","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"4890-4898","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["UNER: A Unified Prediction Head for Named Entity Recognition in Visually-rich Documents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2184-4443","authenticated-orcid":false,"given":"Yi","family":"Tu","sequence":"first","affiliation":[{"name":"Tiansuan Security Lab, Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6083-7593","authenticated-orcid":false,"given":"Chong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9242-493X","authenticated-orcid":false,"given":"Ya","family":"Guo","sequence":"additional","affiliation":[{"name":"Tiansuan Security Lab, Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7280-2956","authenticated-orcid":false,"given":"Huan","family":"Chen","sequence":"additional","affiliation":[{"name":"Tiansuan Security Lab, Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5630-9756","authenticated-orcid":false,"given":"Jinyang","family":"Tang","sequence":"additional","affiliation":[{"name":"Tiansuan Security Lab, Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5784-7225","authenticated-orcid":false,"given":"Huijia","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tiansuan Security Lab, Ant Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0947-4942","authenticated-orcid":false,"given":"Qi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"BEiT: BERT Pre-Training of Image Transformers. In International Conference on Learning Representations.","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. BEiT: BERT Pre-Training of Image Transformers. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Machine Learning. PMLR, 642--652","author":"Bao Hangbo","year":"2020","unstructured":"Hangbo Bao, Li Dong, Furu Wei, Wenhui Wang, Nan Yang, Xiaodong Liu, Yu Wang, Jianfeng Gao, Songhao Piao, Ming Zhou, et al. 2020. Unilmv2: Pseudo-masked language models for unified language model pre-training. In International Conference on Machine Learning. PMLR, 642--652."},{"key":"e_1_3_2_1_3_1","volume-title":"Trie: Towards end-to-end information extraction from visually rich documents. arXiv preprint arXiv:2207.06744","author":"Cheng Zhanzhan","year":"2022","unstructured":"Zhanzhan Cheng, Peng Zhang, Can Li, Qiao Liang, Yunlu Xu, Pengfei Li, Shiliang Pu, Yi Niu, and Fei Wu. 2022. Trie: Towards end-to-end information extraction from visually rich documents. arXiv preprint arXiv:2207.06744 (2022)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00454"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.493"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21344"},{"key":"e_1_3_2_1_11_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547911"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475345"},{"key":"e_1_3_2_1_14_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.395"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00685"},{"key":"e_1_3_2_1_17_1","volume-title":"Workshop on Document Intelligence at NeurIPS","author":"Park Seunghyun","year":"2019","unstructured":"Seunghyun Park, Seung Shin, Bado Lee, Junyeop Lee, Jaeheung Surh, Minjoon Seo, and Hwalsuk Lee. 2019. CORD: a consolidated receipt dataset for post-OCR parsing. In Workshop on Document Intelligence at NeurIPS 2019."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.274"},{"volume-title":"Natural language processing using very large corpora","author":"Ramshaw Lance A","key":"e_1_3_2_1_19_1","unstructured":"Lance A Ramshaw and Mitchell P Marcus. 1999. Text chunking using transformation-based learning. In Natural language processing using very large corpora. Springer, 157--176."},{"key":"e_1_3_2_1_20_1","volume-title":"Michal Uvrivc\u00e1vr, Yash Patel, Ahmed Hamdi, Matvej Koci\u00e1n, Maty\u00e1vs Skalick\u1ef3, Jivr\u00ed Matas, Antoine Doucet, Micka\u00ebl Coustaty, et al.","author":"Simsa Stvep\u00e1n","year":"2023","unstructured":"vStvep\u00e1n vSimsa, Milan vSulc, Michal Uvrivc\u00e1vr, Yash Patel, Ahmed Hamdi, Matvej Koci\u00e1n, Maty\u00e1vs Skalick\u1ef3, Jivr\u00ed Matas, Antoine Doucet, Micka\u00ebl Coustaty, et al. 2023. Docile benchmark for document information localization and extraction. arXiv preprint arXiv:2302.05658 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Zlpr: A novel loss for multi-label classification. arXiv preprint arXiv:2208.02955","author":"Su Jianlin","year":"2022","unstructured":"Jianlin Su, Mingren Zhu, Ahmed Murtadha, Shengfeng Pan, Bo Wen, and Yunfeng Liu. 2022. Zlpr: A novel loss for multi-label classification. arXiv preprint arXiv:2208.02955 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Spatial dual-modality graph reasoning for key information extraction. arXiv preprint arXiv:2103.14470","author":"Sun Hongbin","year":"2021","unstructured":"Hongbin Sun, Zhanghui Kuang, Xiaoyu Yue, Chenhao Lin, and Wayne Zhang. 2021. Spatial dual-modality graph reasoning for key information extraction. arXiv preprint arXiv:2103.14470 (2021)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01845"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.847"},{"key":"e_1_3_2_1_25_1","volume-title":"Graph Attention Networks. In International Conference on Learning Representations.","author":"Velivckovi\u0107 Petar","year":"2018","unstructured":"Petar Velivckovi\u0107, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Li\u00f2, and Yoshua Bengio. 2018. Graph Attention Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.534"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16378"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.63"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.389"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403172"},{"key":"e_1_3_2_1_31_1","volume-title":"Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836","author":"Xu Yiheng","year":"2021","unstructured":"Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, and Furu Wei. 2021. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836 (2021)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.451"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01474"},{"key":"e_1_3_2_1_36_1","volume-title":"ICDAR 2023 Competition on Structured Text Extraction from Visually-Rich Document Images. arXiv preprint arXiv:2306","author":"Yu Wenwen","year":"2023","unstructured":"Wenwen Yu, Chengquan Zhang, Haoyu Cao, Wei Hua, Bohan Li, Huang Chen, Mingyu Liu, Mingrui Chen, Jianfeng Kuang, Mengjun Cheng, et al. 2023. ICDAR 2023 Competition on Structured Text Extraction from Visually-Rich Document Images. arXiv preprint arXiv:2306.03287 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.846"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413900"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1139"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681473","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681473","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681473"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":39,"alternative-id":["10.1145\/3664647.3681473","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681473","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}