{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,7]],"date-time":"2026-07-07T15:57:16Z","timestamp":1783439836841,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T00:00:00Z","timestamp":1602460800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,12]]},"DOI":"10.1145\/3394171.3413900","type":"proceedings-article","created":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T12:26:25Z","timestamp":1602505585000},"page":"1413-1422","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":84,"title":["TRIE: End-to-End Text Reading and Information Extraction for Document Understanding"],"prefix":"10.1145","author":[{"given":"Peng","family":"Zhang","sequence":"first","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yunlu","family":"Xu","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhanzhan","family":"Cheng","sequence":"additional","affiliation":[{"name":"Zhejiang University &amp; Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shiliang","family":"Pu","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jing","family":"Lu","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liang","family":"Qiao","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yi","family":"Niu","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fei","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2020,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR. 6077--6086. Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR. 6077--6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_2_2_1","volume-title":"Rosetta: Large Scale System for Text Detection and Recognition in Images. In KDD. 71--79.","author":"Borisyuk Fedor","year":"2018"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Michal Busta Lukas Neumann and Jiri Matas. 2017. Deep TextSpotter: An End-to-End Trainable Scene Text Localization and Recognition Framework. In ICCV. 2223--2231. Michal Busta Lukas Neumann and Jiri Matas. 2017. Deep TextSpotter: An End-to-End Trainable Scene Text Localization and Recognition Framework. In ICCV. 2223--2231.","DOI":"10.1109\/ICCV.2017.242"},{"key":"e_1_3_2_2_4_1","unstructured":"Manuel Carbonell Alicia Forn\u00e9 s Mauricio Villegas and Josep Llad\u00f3 s. 2019. TreyNet: A Neural Model for Text Localization Transcription and Named Entity Recognition in Full Pages. arXiv preprint arXiv:1912.10016 (2019). Manuel Carbonell Alicia Forn\u00e9 s Mauricio Villegas and Josep Llad\u00f3 s. 2019. TreyNet: A Neural Model for Text Localization Transcription and Named Entity Recognition in Full Pages. arXiv preprint arXiv:1912.10016 (2019)."},{"key":"e_1_3_2_2_5_1","volume-title":"Focusing Attention: Towards Accurate Text Recognition in Natural Images. In ICCV. 5086--5094.","author":"Cheng Zhanzhan","year":"2017"},{"key":"e_1_3_2_2_6_1","unstructured":"Junyoung Chung cC aglar G\u00fc lcc ehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling. arXiv preprint arXiv:1412.3555 (2014). Junyoung Chung cC aglar G\u00fc lcc ehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_2_7_1","unstructured":"Zihang Dai Zhilin Yang Yiming Yang Jaime G. Carbonell Quoc Viet Le and Ruslan Salakhutdinov. 2019. Transformer-XL: Attentive Language Models beyond a Fixed-Length Context. In ACL. 2978--2988. Zihang Dai Zhilin Yang Yiming Yang Jaime G. Carbonell Quoc Viet Le and Ruslan Salakhutdinov. 2019. Transformer-XL: Attentive Language Models beyond a Fixed-Length Context. In ACL. 2978--2988."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45869-7_47"},{"key":"e_1_3_2_2_9_1","unstructured":"Timo I. Denk and Christian Reisswig. 2019. BERTgrid: Contextualized Embedding for 2D Document Representation and Understanding. arXiv preprint arXiv:1909.04948 (2019). Timo I. Denk and Christian Reisswig. 2019. BERTgrid: Contextualized Embedding for 2D Document Representation and Understanding. arXiv preprint arXiv:1909.04948 (2019)."},{"key":"e_1_3_2_2_10_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT. 4171--4186.","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_2_11_1","volume-title":"Document Recognition and Retrieval XIX, part of the IS&T-SPIE Electronic Imaging Symposium (SPIE Proceedings)","author":"Esser Daniel"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Wei Feng Wenhao He Fei Yin Xu-Yao Zhang and Cheng-Lin Liu. 2019. TextDragon: An End-to-End Framework for Arbitrary Shaped Text Spotting. In ICCV. 9075--9084. Wei Feng Wenhao He Fei Yin Xu-Yao Zhang and Cheng-Lin Liu. 2019. TextDragon: An End-to-End Framework for Arbitrary Shaped Text Spotting. In ICCV. 9075--9084.","DOI":"10.1109\/ICCV.2019.00917"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Akira Fukui Dong Huk Park Daylen Yang Anna Rohrbach Trevor Darrell and Marcus Rohrbach. 2016. Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding. In EMNLP. 457--468. Akira Fukui Dong Huk Park Daylen Yang Anna Rohrbach Trevor Darrell and Marcus Rohrbach. 2016. Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding. In EMNLP. 457--468.","DOI":"10.18653\/v1\/D16-1044"},{"key":"e_1_3_2_2_14_1","volume-title":"EATEN: Entity-Aware Attention for Single Shot Visual Text Extraction. In ICDAR. 254--259.","author":"Guo He","year":"2019"},{"key":"e_1_3_2_2_15_1","unstructured":"Kaiming He Georgia Gkioxari Piotr Doll\u00e1 r and Ross B. Girshick. 2017a. Mask R-CNN. In ICCV. 2980--2988. Kaiming He Georgia Gkioxari Piotr Doll\u00e1 r and Ross B. Girshick. 2017a. Mask R-CNN. In ICCV. 2980--2988."},{"key":"e_1_3_2_2_16_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778. Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"crossref","unstructured":"Pan He Weilin Huang Tong He Qile Zhu Yu Qiao and Xiaolin Li. 2017b. Single Shot Text Detector with Regional Attention. In ICCV. 3066--3074. Pan He Weilin Huang Tong He Qile Zhu Yu Qiao and Xiaolin Li. 2017b. Single Shot Text Detector with Regional Attention. In ICCV. 3066--3074.","DOI":"10.1109\/ICCV.2017.331"},{"key":"e_1_3_2_2_18_1","unstructured":"Tong He Zhi Tian Weilin Huang Chunhua Shen Yu Qiao and Changming Sun. 2018. An End-to-End TextSpotter With Explicit Alignment and Attention. In CVPR. 5020--5029. Tong He Zhi Tian Weilin Huang Chunhua Shen Yu Qiao and Changming Sun. 2018. An End-to-End TextSpotter With Explicit Alignment and Attention. In CVPR. 5020--5029."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_20_1","volume-title":"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction. In ICDAR. 1516--1520","author":"Huang Zheng"},{"key":"e_1_3_2_2_21_1","volume-title":"Connectionist, Statistical, and Symbolic Approaches to Learning for Natural Language Processing (Lecture Notes in Computer Science)","author":"Huffman Scott B."},{"key":"e_1_3_2_2_22_1","first-page":"338","article-title":"Apparatus and method for searching and retrieving structured, semi-structured and unstructured content","volume":"10","author":"Judd Douglass Russell","year":"2004","journal-title":"US Patent App."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Anoop R. Katti Christian Reisswig Cordula Guder Sebastian Brarda Steffen Bickel Johannes H\u00f6 hne and Jean Baptiste Faddoul. 2018. Chargrid: Towards Understanding 2D Documents. In EMNLP. 4459--4469. Anoop R. Katti Christian Reisswig Cordula Guder Sebastian Brarda Steffen Bickel Johannes H\u00f6 hne and Jean Baptiste Faddoul. 2018. Chargrid: Towards Understanding 2D Documents. In EMNLP. 4459--4469.","DOI":"10.18653\/v1\/D18-1476"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Guillaume Lample Miguel Ballesteros Sandeep Subramanian Kazuya Kawakami and Chris Dyer. 2016. Neural Architectures for Named Entity Recognition. In NAACL-HLT. 260--270. Guillaume Lample Miguel Ballesteros Sandeep Subramanian Kazuya Kawakami and Chris Dyer. 2016. Neural Architectures for Named Entity Recognition. In NAACL-HLT. 260--270.","DOI":"10.18653\/v1\/N16-1030"},{"key":"e_1_3_2_2_25_1","unstructured":"Chen-Yu Lee and Simon Osindero. 2016. Recursive Recurrent Nets with Attention Modeling for OCR in the Wild. In CVPR. 2231--2239. Chen-Yu Lee and Simon Osindero. 2016. Recursive Recurrent Nets with Attention Modeling for OCR in the Wild. In CVPR. 2231--2239."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Hui Li Peng Wang and Chunhua Shen. 2017. Towards End-to-End Text Spotting with Convolutional Recurrent Neural Networks. In ICCV. 5248--5256. Hui Li Peng Wang and Chunhua Shen. 2017. Towards End-to-End Text Spotting with Convolutional Recurrent Neural Networks. In ICCV. 5248--5256.","DOI":"10.1109\/ICCV.2017.560"},{"key":"e_1_3_2_2_27_1","unstructured":"Liunian Harold Li Mark Yatskar Da Yin Cho-Jui Hsieh and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint arXiv:1908.03557 (2019). Liunian Harold Li Mark Yatskar Da Yin Cho-Jui Hsieh and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Minghui Liao Baoguang Shi Xiang Bai Xinggang Wang and Wenyu Liu. 2017. TextBoxes: A Fast Text Detector with a Single Deep Neural Network. In AAAI. 4161--4167. Minghui Liao Baoguang Shi Xiang Bai Xinggang Wang and Wenyu Liu. 2017. TextBoxes: A Fast Text Detector with a Single Deep Neural Network. In AAAI. 4161--4167.","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"e_1_3_2_2_29_1","unstructured":"Tsung-Yi Lin Piotr Doll\u00e1 r Ross B. Girshick Kaiming He Bharath Hariharan and Serge J. Belongie. 2017. Feature Pyramid Networks for Object Detection. In CVPR. 936--944. Tsung-Yi Lin Piotr Doll\u00e1 r Ross B. Girshick Kaiming He Bharath Hariharan and Serge J. Belongie. 2017. Feature Pyramid Networks for Object Detection. In CVPR. 936--944."},{"key":"e_1_3_2_2_30_1","unstructured":"Xiaojing Liu Feiyu Gao Qiong Zhang and Huasha Zhao. 2019. Graph Convolution for Multimodal Information Extraction from Visually Rich Documents. In NAACL-HLT. 32--39. Xiaojing Liu Feiyu Gao Qiong Zhang and Huasha Zhao. 2019. Graph Convolution for Multimodal Information Extraction from Visually Rich Documents. In NAACL-HLT. 32--39."},{"key":"e_1_3_2_2_31_1","volume-title":"FOTS: Fast Oriented Text Spotting With a Unified Network. In CVPR. 5676--5685.","author":"Liu Xuebo","year":"2018"},{"key":"e_1_3_2_2_32_1","unstructured":"Yuliang Liu and Lianwen Jin. 2017. Deep Matching Prior Network: Toward Tighter Multi-oriented Text Detection. In CVPR. 3454--3461. Yuliang Liu and Lianwen Jin. 2017. Deep Matching Prior Network: Toward Tighter Multi-oriented Text Detection. In CVPR. 3454--3461."},{"key":"e_1_3_2_2_33_1","volume-title":"Textsnake: A Flexible Representation for Detecting Text of Arbitrary Shapes. In ECCV. 19--35.","author":"Long Shangbang","year":"2018"},{"key":"e_1_3_2_2_34_1","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In NeurIPS. 13--23. Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In NeurIPS. 13--23."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Pengyuan Lyu Minghui Liao Cong Yao Wenhao Wu and Xiang Bai. 2018. Mask TextSpotter: An End-to-End Trainable Neural Network for Spotting Text with Arbitrary Shapes. In ECCV. 71--88. Pengyuan Lyu Minghui Liao Cong Yao Wenhao Wu and Xiang Bai. 2018. Mask TextSpotter: An End-to-End Trainable Neural Network for Spotting Text with Arbitrary Shapes. In ECCV. 71--88.","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"e_1_3_2_2_36_1","unstructured":"Xuezhe Ma and Eduard H. Hovy. 2016. End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. In ACL. Xuezhe Ma and Eduard H. Hovy. 2016. End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. In ACL."},{"key":"e_1_3_2_2_37_1","unstructured":"Bodhisattwa Prasad Majumder Navneet Potti Sandeep Tata James Bradley Wendt Qi Zhao and Marc Najork. 2020. Representation Learning for Information Extraction from Form-like Documents. In ACL. 6495--6504. Bodhisattwa Prasad Majumder Navneet Potti Sandeep Tata James Bradley Wendt Qi Zhao and Marc Najork. 2020. Representation Learning for Information Extraction from Form-like Documents. In ACL. 6495--6504."},{"key":"e_1_3_2_2_38_1","volume-title":"AAAI","volume":"2","author":"Ion"},{"key":"e_1_3_2_2_39_1","unstructured":"Rasmus Berg Palm Florian Laws and Ole Winther. 2019. Attend Copy Parse End-to-end Information Extraction from Documents. In ICDAR. 329--336. Rasmus Berg Palm Florian Laws and Ole Winther. 2019. Attend Copy Parse End-to-end Information Extraction from Documents. In ICDAR. 329--336."},{"key":"e_1_3_2_2_40_1","unstructured":"Rasmus Berg Palm Ole Winther and Florian Laws. 2017. CloudScan - A Configuration-Free Invoice Analysis System Using Recurrent Neural Networks. In ICDAR. 406--413. Rasmus Berg Palm Ole Winther and Florian Laws. 2017. CloudScan - A Configuration-Free Invoice Analysis System Using Recurrent Neural Networks. In ICDAR. 406--413."},{"key":"e_1_3_2_2_41_1","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga Alban Desmaison Andreas K\u00f6 pf Edward Yang Zachary DeVito Martin Raison Alykhan Tejani Sasank Chilamkurthy Benoit Steiner Lu Fang Junjie Bai and Soumith Chintala. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. In NeurIPS. 8024--8035. Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga Alban Desmaison Andreas K\u00f6 pf Edward Yang Zachary DeVito Martin Raison Alykhan Tejani Sasank Chilamkurthy Benoit Steiner Lu Fang Junjie Bai and Soumith Chintala. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. In NeurIPS. 8024--8035."},{"key":"e_1_3_2_2_42_1","volume-title":"Text Perceptron: Towards End-to-End Arbitrary-Shaped Text Spotting. In AAAI.","author":"Qiao Liang","year":"2020"},{"key":"e_1_3_2_2_43_1","unstructured":"Shaoqing Ren Kaiming He Ross B. Girshick and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In NeurIPS. 91--99. Shaoqing Ren Kaiming He Ross B. Girshick and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In NeurIPS. 91--99."},{"key":"e_1_3_2_2_44_1","volume-title":"Proceedings of the 11th National Conference on Artificial Intelligence. 811--816","author":"Riloff Ellen","year":"1993"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Cl\u00e9 ment Sage Alexandre Aussem Haytham Elghazel V\u00e9 ronique Eglin and J\u00e9 r\u00e9 my Espinas. 2019. Recurrent Neural Network Approach for Table Field Extraction in Business Documents. In ICDAR. 1308--1313. Cl\u00e9 ment Sage Alexandre Aussem Haytham Elghazel V\u00e9 ronique Eglin and J\u00e9 r\u00e9 my Espinas. 2019. Recurrent Neural Network Approach for Table Field Extraction in Business Documents. In ICDAR. 1308--1313.","DOI":"10.1109\/ICDAR.2019.00211"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","unstructured":"Erik F. Tjong Kim Sang and Jorn Veenstra. 1999. Representing Text Chunks. In EACL. 173--179. Erik F. Tjong Kim Sang and Jorn Veenstra. 1999. Representing Text Chunks. In EACL. 173--179.","DOI":"10.3115\/977035.977059"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Daniel Schuster Klemens Muthmann Daniel Esser Alexander Schill Michael Berger Christoph Weidling Kamil Aliyev and Andreas Hofmeier. 2013. Intellix - End-User Trained Information Extraction for Document Archiving. In ICDAR. 101--105. Daniel Schuster Klemens Muthmann Daniel Esser Alexander Schill Michael Berger Christoph Weidling Kamil Aliyev and Andreas Hofmeier. 2013. Intellix - End-User Trained Information Extraction for Document Archiving. In ICDAR. 101--105.","DOI":"10.1109\/ICDAR.2013.28"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1162\/COLI_a_00178"},{"key":"e_1_3_2_2_49_1","unstructured":"Baoguang Shi Xiang Bai and Serge J. Belongie. [n.d.]. Detecting Oriented Text in Natural Images by Linking Segments. In CVPR. Baoguang Shi Xiang Bai and Serge J. Belongie. [n.d.]. Detecting Oriented Text in Natural Images by Linking Segments. In CVPR."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007562322031"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Hao Wang Pu Lu Hui Zhang Mingkun Yang Xiang Bai Yongchao Xu Mengchao He Yongpan Wang and Wenyu Liu. 2020. All You Need Is Boundary: Toward Arbitrary-Shaped Text Spotting. In AAAI. Hao Wang Pu Lu Hui Zhang Mingkun Yang Xiang Bai Yongchao Xu Mengchao He Yongpan Wang and Wenyu Liu. 2020. All You Need Is Boundary: Toward Arbitrary-Shaped Text Spotting. In AAAI.","DOI":"10.1609\/aaai.v34i07.6896"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"crossref","unstructured":"Wenhai Wang Enze Xie Xiang Li Wenbo Hou Tong Lu Gang Yu and Shuai Shao. 2019. Shape Robust Text Detection With Progressive Scale Expansion Network. In CVPR. Wenhai Wang Enze Xie Xiang Li Wenbo Hou Tong Lu Gang Yu and Shuai Shao. 2019. Shape Robust Text Detection With Progressive Scale Expansion Network. In CVPR.","DOI":"10.1109\/CVPR.2019.00956"},{"key":"e_1_3_2_2_54_1","unstructured":"Yiheng Xu Minghao Li Lei Cui Shaohan Huang Furu Wei and Ming Zhou. 2020. LayoutLM: Pre-training of Text and Layout for Document Image Understanding. KDD (2020). Yiheng Xu Minghao Li Lei Cui Shaohan Huang Furu Wei and Ming Zhou. 2020. LayoutLM: Pre-training of Text and Layout for Document Image Understanding. KDD (2020)."},{"key":"e_1_3_2_2_55_1","unstructured":"Vikas Yadav and Steven Bethard. 2018. A Survey on Recent Advances in Named Entity Recognition from Deep Learning models. In COLING. 2145--2158. Vikas Yadav and Steven Bethard. 2018. A Survey on Recent Advances in Named Entity Recognition from Deep Learning models. In COLING. 2145--2158."},{"key":"e_1_3_2_2_56_1","unstructured":"Zhilin Yang Zihang Dai Yiming Yang Jaime G. Carbonell Ruslan Salakhutdinov and Quoc V. Le. 2019. XLNet: Generalized Autoregressive Pretraining for Language Understanding. In NeurIPS. 5754--5764. Zhilin Yang Zihang Dai Yiming Yang Jaime G. Carbonell Ruslan Salakhutdinov and Quoc V. Le. 2019. XLNet: Generalized Autoregressive Pretraining for Language Understanding. In NeurIPS. 5754--5764."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"crossref","unstructured":"Zichao Yang Xiaodong He Jianfeng Gao Li Deng and Alexander J. Smola. 2016. Stacked Attention Networks for Image Question Answering. In CVPR. 21--29. Zichao Yang Xiaodong He Jianfeng Gao Li Deng and Alexander J. Smola. 2016. Stacked Attention Networks for Image Question Answering. In CVPR. 21--29.","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_2_58_1","volume-title":"PICK: Processing Key Information Extraction from Documents using Improved Graph Learning-Convolutional Networks. ICPR","author":"Yu Wenwen","year":"2020"},{"key":"e_1_3_2_2_59_1","volume-title":"ADADELTA: An Adaptive Learning Rate Method. arXiv preprint arXiv:1212.5701","author":"Zeiler Matthew D.","year":"2012"},{"key":"e_1_3_2_2_60_1","unstructured":"Xiang Zhang Junbo Jake Zhao and Yann LeCun. 2015. Character-level Convolutional Networks for Text Classification. In NeurIPS. 649--657. Xiang Zhang Junbo Jake Zhao and Yann LeCun. 2015. Character-level Convolutional Networks for Text Classification. In NeurIPS. 649--657."},{"key":"e_1_3_2_2_61_1","volume-title":"CUTIE: Learning to Understand Documents with Convolutional Universal Text Information Extractor. arXiv preprint arXiv:1903.12363","author":"Zhao Xiaohui","year":"2019"},{"key":"e_1_3_2_2_62_1","volume-title":"EAST: An Efficient and Accurate Scene Text Detector. In CVPR. 2642--2651.","author":"Zhou Xinyu","year":"2017"}],"event":{"name":"MM '20: The 28th ACM International Conference on Multimedia","location":"Seattle WA USA","acronym":"MM '20","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 28th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413900","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3394171.3413900","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:32:06Z","timestamp":1750195926000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413900"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,12]]},"references-count":62,"alternative-id":["10.1145\/3394171.3413900","10.1145\/3394171"],"URL":"https:\/\/doi.org\/10.1145\/3394171.3413900","relation":{},"subject":[],"published":{"date-parts":[[2020,10,12]]},"assertion":[{"value":"2020-10-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}