{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T16:22:32Z","timestamp":1774369352498,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755204","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"3817-3826","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Optimal Feature Embedding for Document Large Visual Language Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5821-021X","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"first","affiliation":[{"name":"School of Electronic and Information Engineering, South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5607-8192","authenticated-orcid":false,"given":"Ling","family":"Deng","sequence":"additional","affiliation":[{"name":"China United Network Communications Corporation Limited Guangdong Branch, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6451-1068","authenticated-orcid":false,"given":"Zhiyong","family":"Gan","sequence":"additional","affiliation":[{"name":"China United Network Communications Corporation Limited Guangdong Branch, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6535-0100","authenticated-orcid":false,"given":"Qisheng","family":"He","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3830-0178","authenticated-orcid":false,"given":"Yuanbo","family":"Fang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4573-5820","authenticated-orcid":false,"given":"Xiangmin","family":"Xu","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5544-4544","authenticated-orcid":false,"given":"Shuangping","family":"Huang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5848-5624","authenticated-orcid":false,"given":"Tianshui","family":"Chen","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_2_1","first-page":"993","article-title":"DocFormer","author":"Appalaraju Srikar","year":"2021","unstructured":"Srikar Appalaraju, Bhavan Jasani, Bhargava Urala Kota, Yusheng Xie, and R. Manmatha. 2021. DocFormer: End-to-End Transformer for Document Understanding. In ICCV. 993-1003.","journal-title":"End-to-End Transformer for Document Understanding. In ICCV."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.54870\/1551-3440.1099"},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Representation learning: A review and new perspectives","author":"Bengio Yoshua","year":"2013","unstructured":"Yoshua Bengio, Aaron Courville, and Pascal Vincent. 2013. Representation learning: A review and new perspectives. IEEE transactions on pattern analysis and machine intelligence, Vol. 35, 8 (2013), 1798-1828."},{"key":"e_1_3_2_1_6_1","volume-title":"Large-scale machine learning with stochastic gradient descent","author":"Bottou L\u00e9on","unstructured":"L\u00e9on Bottou. 2010. Large-scale machine learning with stochastic gradient descent. Springer. 177-186 pages."},{"key":"e_1_3_2_1_7_1","volume-title":"Berger","author":"Casella George","year":"2002","unstructured":"George Casella and Roger L. Berger. 2002. Statistical Inference (2nd ed.). Duxbury Press."},{"key":"e_1_3_2_1_8_1","volume-title":"Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794","author":"Chen Xi","year":"2022","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, et al., 2022. Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_10_1","first-page":"3576","article-title":"InfoXLM: An information-theoretic framework for cross-lingual language model pre-training","author":"Chi Zewen","year":"2021","unstructured":"Zewen Chi, Li Dong, Furu Wei, Nan Yang, Saksham Singhal, Wenhui Wang, Xia Song, Xian-Ling Mao, He-Yan Huang, and Ming Zhou. 2021. InfoXLM: An information-theoretic framework for cross-lingual language model pre-training. In NAACL-HLT. 3576-3588.","journal-title":"NAACL-HLT."},{"key":"e_1_3_2_1_11_1","first-page":"4171","article-title":"BERT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proc. NAACL. 4171-4186.","journal-title":"Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proc. NAACL."},{"key":"e_1_3_2_1_12_1","volume-title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model. arXiv: Comp. Res. Repository","author":"Dong Xiaoyi","year":"2024","unstructured":"Xiaoyi Dong, Pan Zhang, Yuhang Zang, Yuhang Cao, Bin Wang, Linke Ouyang, Xilin Wei, Songyang Zhang, Haodong Duan, Maosong Cao, Wenwei Zhang, Yining Li, Hang Yan, Yang Gao, Xinyue Zhang, Wei Li, Jingwen Li, Kai Chen, Conghui He, Xingcheng Zhang, Yu Qiao, Dahua Lin, and Jiaqi Wang. 2024. InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model. arXiv: Comp. Res. Repository, Vol. abs\/2401.16420 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"The design of experiments. Oliver and Boyd","author":"Fisher Ronald A.","year":"1935","unstructured":"Ronald A. Fisher. 1935. The design of experiments. Oliver and Boyd (1935), 1-272."},{"key":"e_1_3_2_1_14_1","volume-title":"Emergent properties of the local geometry of neural loss landscapes. arXiv preprint arXiv:1906.04724","author":"Fort Stanislav","year":"2019","unstructured":"Stanislav Fort and Surya Ganguli. 2019. Emergent properties of the local geometry of neural loss landscapes. arXiv preprint arXiv:1906.04724 (2019)."},{"key":"e_1_3_2_1_15_1","first-page":"10219","article-title":"LayoutLLM","author":"Fujitake Masato","year":"2024","unstructured":"Masato Fujitake. 2024. LayoutLLM: Large Language Model Instruction Tuning for Visually Rich Document Understanding. In Proc. LREC\/COLING. 10219-10224.","journal-title":"In Proc. LREC\/COLING."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1134\/S1064230710050060"},{"key":"e_1_3_2_1_17_1","volume-title":"Deep learning","author":"Goodfellow Ian","unstructured":"Ian Goodfellow, Yoshua Bengio, and Aaron Courville. 2016. Deep learning. MIT press."},{"key":"e_1_3_2_1_18_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arXiv: Comp. Res. Repository","author":"Google Gemini Team","year":"2023","unstructured":"Gemini Team Google. 2023. Gemini: A Family of Highly Capable Multimodal Models. arXiv: Comp. Res. Repository, Vol. abs\/2312.11805 (2023)."},{"key":"e_1_3_2_1_19_1","first-page":"991","article-title":"Evaluation of deep convolutional nets for document image classification and retrieval","author":"Harley Adam W","year":"2015","unstructured":"Adam W Harley et al., 2015. Evaluation of deep convolutional nets for document image classification and retrieval. In ICDAR. 991-995.","journal-title":"ICDAR."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Bros: A pre-trained language model focusing on text and layout for better key information extraction from documents. In AAAI.","author":"Hong Teakgyu","year":"2022","unstructured":"Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, and Sungrae Park. 2022. Bros: A pre-trained language model focusing on text and layout for better key information extraction from documents. In AAAI."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.175"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2409.03420"},{"key":"e_1_3_2_1_24_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J.","year":"2022","unstructured":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01071"},{"key":"e_1_3_2_1_26_1","first-page":"4083","article-title":"LayoutLMv3","author":"Huang Yupan","year":"2022","unstructured":"Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, and Furu Wei. 2022. LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In ACM MM. 4083-4091.","journal-title":"In ACM MM."},{"key":"e_1_3_2_1_27_1","volume-title":"Hazim Kemal Ekenel, and Jean-Philippe Thiran","author":"Jaume Guillaume","year":"2019","unstructured":"Guillaume Jaume, Hazim Kemal Ekenel, and Jean-Philippe Thiran. 2019. FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents. arXiv:1905.13538 [cs.IR]"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_29_1","volume-title":"A history of mathematics: An introduction. (No Title)","author":"Katz Victor J","year":"1998","unstructured":"Victor J Katz. 1998. A history of mathematics: An introduction. (No Title) (1998)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 3rd International Conference on Learning Representations (ICLR). arXiv.","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Lei Ba. 2015. Adam: A method for stochastic optimization. In Proceedings of the 3rd International Conference on Learning Representations (ICLR). arXiv."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/1148170.1148307"},{"key":"e_1_3_2_1_33_1","first-page":"6389","article-title":"Visualizing the loss landscape of neural nets","author":"Li Hao","year":"2018","unstructured":"Hao Li, Zheng Xu, Gavin Taylor, Christoph Studer, and Tom Goldstein. 2018. Visualizing the loss landscape of neural nets. In NeurIPS. 6389-6399.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_35_1","first-page":"5652","article-title":"SelfDoc","author":"Li Peizhao","year":"2021","unstructured":"Peizhao Li, Jiuxiang Gu, Jason Kuen, Vlad I. Morariu, Handong Zhao, Rajiv Jain, Varun Manjunatha, and Hongfu Liu. 2021a. SelfDoc: Self-Supervised Document Representation Learning. In CVPR. 5652-5660.","journal-title":"Self-Supervised Document Representation Learning. In CVPR."},{"key":"e_1_3_2_1_36_1","first-page":"1912","article-title":"StrucTexT: Structured text understanding with multi-modal Transformers","author":"Li Yulin","year":"2021","unstructured":"Yulin Li, Yuxi Qian, Yuechen Yu, Xiameng Qin, Chengquan Zhang, Yan Liu, Kun Yao, Junyu Han, Jingtuo Liu, and Errui Ding. 2021b. StrucTexT: Structured text understanding with multi-modal Transformers. In ACM Multimedia. 1912-1920.","journal-title":"ACM Multimedia."},{"key":"e_1_3_2_1_37_1","volume-title":"DocLayLLM: An Efficient and Effective Multi-modal Extension of Large Language Models for Text-rich Document Understanding. arXiv preprint arXiv:2408.15045","author":"Liao Wenhui","year":"2024","unstructured":"Wenhui Liao, Jiapeng Wang, Hongliang Li, Chengyu Wang, Jun Huang, and Lianwen Jin. 2024. DocLayLLM: An Efficient and Effective Multi-modal Extension of Large Language Models for Text-rich Document Understanding. arXiv preprint arXiv:2408.15045 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023a. Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_39_1","first-page":"34892","article-title":"Visual Instruction Tuning","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual Instruction Tuning. NeurIPS, Vol. 36 (2023), 34892-34916.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_40_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"TextMonkey: An OCR-Free Large Multimodal Model for Understanding Document. arXiv: Comp. Res. Repository","author":"Liu Yuliang","year":"2024","unstructured":"Yuliang Liu, Biao Yang, Qiang Liu, Zhang Li, Zhiyin Ma, Shuo Zhang, and Xiang Bai. 2024b. TextMonkey: An OCR-Free Large Multimodal Model for Understanding Document. arXiv: Comp. Res. Repository, Vol. abs\/2403.04473 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110816"},{"key":"e_1_3_2_1_43_1","first-page":"7092","article-title":"GeoLayoutLM","author":"Luo Chuwei","year":"2023","unstructured":"Chuwei Luo, Changxu Cheng, Qi Zheng, and Cong Yao. 2023. GeoLayoutLM: Geometric Pre-Training for Visual Information Extraction. In CVPR. 7092-7101.","journal-title":"Geometric Pre-Training for Visual Information Extraction. In CVPR."},{"key":"e_1_3_2_1_44_1","first-page":"15630","article-title":"LayoutLLM","author":"Luo Chuwei","year":"2024","unstructured":"Chuwei Luo, Yufan Shen, Zhaoqing Zhu, Qi Zheng, Zhi Yu, and Cong Yao. 2024. LayoutLLM: Layout Instruction Tuning with Large Language Models for Document Understanding. In CVPR. 15630-15640.","journal-title":"In CVPR."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2022.109006"},{"key":"e_1_3_2_1_46_1","first-page":"2200","article-title":"Docvqa: A dataset for vqa on document images","author":"Mathew Minesh","year":"2021","unstructured":"Minesh Mathew, Dimosthenis Karatzas, and CV Jawahar. 2021. Docvqa: A dataset for vqa on document images. In WACV. 2200-2209.","journal-title":"WACV."},{"key":"e_1_3_2_1_47_1","volume-title":"Runger","author":"Montgomery Douglas C.","year":"2010","unstructured":"Douglas C. Montgomery and George C. Runger. 2010. Applied Statistics and Probability for Engineers (5th ed.). Wiley."},{"key":"e_1_3_2_1_48_1","volume-title":"Numerical optimization","author":"Nocedal Jorge","unstructured":"Jorge Nocedal and Stephen Wright. 2006. Numerical optimization. Springer."},{"key":"e_1_3_2_1_49_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv: Comp. Res. Repository Vol. abs\/2303.08774 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_51_1","unstructured":"Sebastian Ruder. 2016. An overview of gradient descent optimization algorithms."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.5555\/2627435.2670313"},{"key":"e_1_3_2_1_53_1","volume-title":"TextSquare: Scaling up Text-Centric Visual Instruction Tuning. arXiv: Comp. Res. Repository","author":"Tang Jingqun","year":"2024","unstructured":"Jingqun Tang, Chunhui Lin, Zhen Zhao, Shu Wei, Binghong Wu, Qi Liu, Hao Feng, Yang Li, Siqi Wang, Lei Liao, Wei Shi, Yuliang Liu, Hao Liu, Yuan Xie, Xiang Bai, and Can Huang. 2024. TextSquare: Scaling up Text-Centric Visual Instruction Tuning. arXiv: Comp. Res. Repository, Vol. abs\/2404.12803 (2024)."},{"key":"e_1_3_2_1_54_1","first-page":"19254","article-title":"Unifying Vision, Text, and Layout for Universal Document Processing","author":"Tang Zineng","year":"2023","unstructured":"Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, and Mohit Bansal. 2023. Unifying Vision, Text, and Layout for Universal Document Processing. In CVPR. 19254-19264.","journal-title":"CVPR."},{"key":"e_1_3_2_1_55_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton-Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aur\u00e9lien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv: Comp. Res. Repository Vol. abs\/2307.09288 (2023)."},{"key":"e_1_3_2_1_56_1","first-page":"15641","article-title":"OmniParser","author":"Wan Jianqiang","year":"2024","unstructured":"Jianqiang Wan, Sibo Song, Wenwen Yu, Yuliang Liu, Wenqing Cheng, Fei Huang, Xiang Bai, Cong Yao, and Zhibo Yang. 2024. OmniParser: A Unified Framework for Text Spotting Key Information Extraction and Table Recognition. In CVPR. 15641-15653.","journal-title":"In CVPR."},{"key":"e_1_3_2_1_57_1","first-page":"7747","article-title":"LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding","author":"Wang Jiapeng","year":"2022","unstructured":"Jiapeng Wang, Lianwen Jin, and Kai Ding. 2022a. LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding. In Proc. ACL. 7747-7757.","journal-title":"Proc. ACL."},{"key":"e_1_3_2_1_58_1","volume-title":"International conference on machine learning. PMLR, 23318-23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022b. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International conference on machine learning. PMLR, 23318-23340."},{"key":"e_1_3_2_1_59_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)."},{"key":"e_1_3_2_1_60_1","volume-title":"All of Statistics: A Concise Course in Statistical Inference","author":"Wasserman Larry","unstructured":"Larry Wasserman. 2004. All of Statistics: A Concise Course in Statistical Inference. Springer."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25402"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403172"},{"key":"e_1_3_2_1_63_1","first-page":"1192","article-title":"LayoutLM: Pre-training of text and layout for document image understanding","author":"Xu Yiheng","year":"2020","unstructured":"Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. 2020b. LayoutLM: Pre-training of text and layout for document image understanding. In ACM-SIGKDD. 1192-1200.","journal-title":"ACM-SIGKDD."},{"key":"e_1_3_2_1_64_1","volume-title":"LayoutXLM: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836","author":"Xu Yiheng","year":"2021","unstructured":"Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, and Furu Wei. 2021a. LayoutXLM: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836 (2021)."},{"key":"e_1_3_2_1_65_1","unstructured":"Yang Xu Yiheng Xu Tengchao Lv Lei Cui Furu Wei Guoxin Wang Yijuan Lu Dinei Florencio Cha Zhang Wanxiang Che et al. 2021b. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding. In ACL."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.6084\/m9.figshare.20647788"},{"key":"e_1_3_2_1_67_1","unstructured":"Jiabo Ye Anwen Hu Haiyang Xu Qinghao Ye Ming Yan Yuhao Dan Chenlin Zhao Guohai Xu Chenliang Li Junfeng Tian et al. 2023a. mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2307.02499"},{"key":"e_1_3_2_1_69_1","first-page":"2841","article-title":"UReader","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Guohai Xu, Chenliang Li, Junfeng Tian, Qi Qian, Ji Zhang, Qin Jin, Liang He, Xin Lin, and Fei Huang. 2023c. UReader: Universal OCR-free Visually-situated Language Understanding with Multimodal Large Language Model. In Findings of EMNLP. 2841-2858.","journal-title":"In Findings of EMNLP."},{"key":"e_1_3_2_1_70_1","unstructured":"Jiaquan Ye Xianbiao Qi Yelin He Yihao Chen Dengyi Gu Peng Gao and Rong Xiao. 2021. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML. (2021). Preprint at https:\/\/arxiv.org\/abs\/2105.01848."},{"key":"e_1_3_2_1_71_1","volume-title":"All you need is a good init. arXiv preprint arXiv:1911.07100","author":"Zhang Chiyuan","year":"2019","unstructured":"Chiyuan Zhang, Yin Luo, Tengyu Ma, and Andrej Risteski. 2019. All you need is a good init. arXiv preprint arXiv:1911.07100 (2019)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_73_1","volume-title":"LLaVAR: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding. arXiv: Comp. Res. Repository","author":"Zhang Yanzhe","year":"2023","unstructured":"Yanzhe Zhang, Ruiyi Zhang, Jiuxiang Gu, Yufan Zhou, Nedim Lipka, Diyi Yang, and Tong Sun. 2023. LLaVAR: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding. arXiv: Comp. Res. Repository, Vol. abs\/2306.17107 (2023)."},{"key":"e_1_3_2_1_74_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric. P Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arXiv:2306.05685 [cs.CL]"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_34"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755204","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:54:29Z","timestamp":1765310069000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755204"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":75,"alternative-id":["10.1145\/3746027.3755204","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755204","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}