{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:41:34Z","timestamp":1765309294921,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754906","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"2949-2957","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DREAM: Document Reconstruction via End-to-end Autoregressive Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1164-9644","authenticated-orcid":false,"given":"Xin","family":"Li","sequence":"first","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3454-915X","authenticated-orcid":false,"given":"Mingming","family":"Gong","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3950-4367","authenticated-orcid":false,"given":"Yunfei","family":"Wu","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6859-2893","authenticated-orcid":false,"given":"Jianxin","family":"Dai","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0168-4028","authenticated-orcid":false,"given":"Antai","family":"Guo","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7791-5159","authenticated-orcid":false,"given":"Xinghua","family":"Jiang","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3789-9705","authenticated-orcid":false,"given":"Haoyu","family":"Cao","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2998-3155","authenticated-orcid":false,"given":"Yinsong","family":"Liu","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3987-2431","authenticated-orcid":false,"given":"Deqiang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8132-9083","authenticated-orcid":false,"given":"Xing","family":"Sun","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418","author":"Blecher Lukas","year":"2023","unstructured":"Lukas Blecher, Guillem Cucurull, Thomas Scialom, and Robert Stojnic. 2023. Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01453"},{"key":"e_1_3_2_1_4_1","volume-title":"Complicated Table Structure Recognition. arXiv preprint arXiv:1908.04729","author":"Chi Zewen","year":"2019","unstructured":"Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. 2019. Complicated Table Structure Recognition. arXiv preprint arXiv:1908.04729 (2019)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01783"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning. 980-989","author":"Deng Yuntian","year":"2017","unstructured":"Yuntian Deng, Anssi Kanervisto, Jeffrey Ling, and Alexander M Rush. 2017. Image-to-markup generation with coarse-to-fine attention. In International Conference on Machine Learning. 980-989."},{"key":"e_1_3_2_1_7_1","volume-title":"Pp-ocr: A practical ultra lightweight ocr system. arXiv preprint arXiv:2009.09941","author":"Du Yuning","year":"2020","unstructured":"Yuning Du, Chenxia Li, Ruoyu Guo, Xiaoting Yin, Weiwei Liu, Jun Zhou, Yifan Bai, Zilin Yu, Yehua Yang, Qingqing Dang, et al., 2020. Pp-ocr: A practical ultra lightweight ocr system. arXiv preprint arXiv:2009.09941 (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"ICDAR 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition. IEEE, 1449-1453","author":"G\u00f6bel Max","year":"2013","unstructured":"Max G\u00f6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. 2013. ICDAR 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition. IEEE, 1449-1453."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00455"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01071"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"e_1_3_2_1_15_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia Raluca Turc, Hexiang Hu, Fangyu Liu, Julian Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893-18912."},{"key":"e_1_3_2_1_17_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_1_18_1","volume-title":"Pp-structurev2: A stronger document analysis system. arXiv preprint arXiv:2210.05391","author":"Li Chenxia","year":"2022","unstructured":"Chenxia Li, Ruoyu Guo, Jun Zhou, Mengtao An, Yuning Du, Lingfeng Zhu, Yi Liu, Xiaoguang Hu, and Dianhai Yu. 2022a. Pp-structurev2: A stronger document analysis system. arXiv preprint arXiv:2210.05391 (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference. 1918-1925","author":"Li Minghao","year":"2020","unstructured":"Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. 2020a. Tablebank: Table benchmark for image-based table detection and recognition. In Proceedings of the Twelfth Language Resources and Evaluation Conference. 1918-1925."},{"key":"e_1_3_2_1_20_1","volume-title":"DocBank: A benchmark dataset for document layout analysis. arXiv preprint arXiv:2006.01038","author":"Li Minghao","year":"2020","unstructured":"Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. 2020b. DocBank: A benchmark dataset for document layout analysis. arXiv preprint arXiv:2006.01038 (2020)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547751"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548038"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00449"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3481534"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_27_1","unstructured":"Tengchao Lv Yupan Huang Jingye Chen Lei Cui Shuming Ma Yaoyao Chang Shaohan Huang Wenhui Wang Li Dong Weiyao Luo et al. 2023. Kosmos-2.5: A Multimodal Literate Model. arXiv preprint arXiv:2309.11419 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"e_1_3_2_1_29_1","volume-title":"ICFHR 2014 competition on recognition of on-line handwritten mathematical expressions (CROHME 2014). In 2014 14th International Conference on Frontiers in Handwriting Recognition. IEEE, 791-796","author":"Mouchere Harold","year":"2014","unstructured":"Harold Mouchere, Christian Viard-Gaudin, Richard Zanibbi, and Utpal Garain. 2014. ICFHR 2014 competition on recognition of on-line handwritten mathematical expressions (CROHME 2014). In 2014 14th International Conference on Frontiers in Handwriting Recognition. IEEE, 791-796."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00457"},{"key":"e_1_3_2_1_31_1","volume-title":"Workshop on Document Intelligence at NeurIPS","author":"Park Seunghyun","year":"2019","unstructured":"Seunghyun Park, Seung Shin, Bado Lee, Junyeop Lee, Jaeheung Surh, Minjoon Seo, and Hwalsuk Lee. 2019. CORD: a consolidated receipt dataset for post-OCR parsing. In Workshop on Document Intelligence at NeurIPS 2019."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539043"},{"key":"e_1_3_2_1_33_1","volume-title":"Teaching machines to code: neural markup generation with visual attention. arXiv preprint arXiv:1802.05415","author":"Singh Sumeet S","year":"2018","unstructured":"Sumeet S Singh. 2018. Teaching machines to code: neural markup generation with visual attention. arXiv preprint arXiv:1802.05415 (2018)."},{"key":"e_1_3_2_1_34_1","volume-title":"Layoutreader: Pre-training of text and layout for reading order detection. arXiv preprint arXiv:2108.11591","author":"Wang Zilong","year":"2021","unstructured":"Zilong Wang, Yiheng Xu, Lei Cui, Jingbo Shang, and Furu Wei. 2021. Layoutreader: Pre-training of text and layout for reading order detection. arXiv preprint arXiv:2108.11591 (2021)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00451"},{"key":"e_1_3_2_1_36_1","volume-title":"Reading Order Matters: Information Extraction from Visually-rich Documents by Token Path Prediction. arXiv preprint arXiv:2310.11016","author":"Zhang Chong","year":"2023","unstructured":"Chong Zhang, Ya Guo, Yi Tu, Huan Chen, Jinyang Tang, Huijia Zhu, Qi Zhang, and Tao Gui. 2023. Reading Order Matters: Information Extraction from Visually-rich Documents by Token Path Prediction. arXiv preprint arXiv:2310.11016 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_34"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00166"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754906","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:38:53Z","timestamp":1765309133000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754906"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":38,"alternative-id":["10.1145\/3746027.3754906","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754906","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}