{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T02:53:24Z","timestamp":1768272804781,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556755","type":"print"},{"value":"9789819556762","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5676-2_9","type":"book-chapter","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T20:32:12Z","timestamp":1768249932000},"page":"125-139","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Boosting Document Image Translation via\u00a0Layout-Aware Semantic Paragraph Clustering"],"prefix":"10.1007","author":[{"given":"Zhiyuan","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yaping","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiyang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yupu","family":"Liang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yue","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunfei","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dandan","family":"Tu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengqing","family":"Zong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"9_CR1","unstructured":"Afli, H., Way, A.: Integrating optical character recognition and machine translation of historical documents. In: Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH), pp. 109\u2013116 (2016)"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Binmakhashen, G.M., Mahmoud, S.A.: Document layout analysis: a comprehensive survey. ACM Comput. Surv. 52(6), 109:1\u2013109:36 (2019), https:\/\/doi.org\/10.1145\/3355610","DOI":"10.1145\/3355610"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Breuel, T.M.: An algorithm for finding maximal whitespace rectangles at arbitrary orientations for document layout analysis. In: Seventh International Conference on Document Analysis and Recognition, 2003. Proceedings, pp. 66\u201370. IEEE (2003)","DOI":"10.1109\/ICDAR.2003.1227629"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). pp. 4171\u20134186 (2019)","DOI":"10.18653\/v1\/N19-1423"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: Bros: a pre-trained language model focusing on text and layout for better key information extraction from documents (2022). https:\/\/arxiv.org\/abs\/2108.04539","DOI":"10.1609\/aaai.v36i10.21322"},{"issue":"2","key":"9_CR6","first-page":"3","volume":"1","author":"EJ Hu","year":"2022","unstructured":"Hu, E.J., et al.: Lora: Low-rank adaptation of large language models. ICLR 1(2), 3 (2022)","journal-title":"ICLR"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: Layoutlmv3: pre-training for document ai with unified text and image masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"9_CR8","unstructured":"JaidedAI: Easyocr (2023). https:\/\/github.com\/JaidedAI\/EasyOCR Accessed 1 June 2025"},{"key":"9_CR9","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Liang, Y., et al.: Document image machine translation with dynamic multi-pre-trained models assembling. In: Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 7084\u20137095 (2024)","DOI":"10.18653\/v1\/2024.naacl-long.392"},{"key":"9_CR11","unstructured":"Liu, A., et\u00a0al.: Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Ma, C., et al.: Improving end-to-end text image translation from the auxiliary text translation task. In: 2022 26th International Conference on Pattern Recognition (ICPR), pp. 1664\u20131670. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9956695"},{"key":"9_CR13","unstructured":"OpenAI: Gpt-4o system card (2024). https:\/\/arxiv.org\/abs\/2410.21276"},{"key":"9_CR14","doi-asserted-by":"publisher","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Isabelle, P., Charniak, E., Lin, D. (eds.) Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318. Association for Computational Linguistics, Philadelphia, Pennsylvania, USA (Jul 2002). https:\/\/doi.org\/10.3115\/1073083.1073135, https:\/\/aclanthology.org\/P02-1040\/","DOI":"10.3115\/1073083.1073135"},{"key":"9_CR15","doi-asserted-by":"publisher","unstructured":"Popovi\u0107, M.: chrF++: words helping character n-grams. In: Bojar, O., Buck, C., Chatterjee, R., Federmann, C., Graham, Y., Haddow, B., Huck, M., Yepes, A.J., Koehn, P., Kreutzer, J. (eds.) Proceedings of the Second Conference on Machine Translation, pp. 612\u2013618. Association for Computational Linguistics, Copenhagen, Denmark (Sep 2017). https:\/\/doi.org\/10.18653\/v1\/W17-4770, https:\/\/aclanthology.org\/W17-4770\/","DOI":"10.18653\/v1\/W17-4770"},{"key":"9_CR16","unstructured":"Qwen Yang, A., et al.: Qwen2.5 technical report (2025). https:\/\/arxiv.org\/abs\/2412.15115"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Sable, N.P., Shelke, P., Deogaonkar, N., Joshi, N., Kabadi, R., Joshi, T.: Doc-handler: document scanner, manipulator, and translator based on image and natural language processing. In: 2023 International Conference on Emerging Smart Computing and Informatics (ESCI), pp.\u00a01\u20136. IEEE (2023)","DOI":"10.1109\/ESCI56872.2023.10099625"},{"issue":"1","key":"9_CR18","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1109\/TNN.2008.2005605","volume":"20","author":"F Scarselli","year":"2008","unstructured":"Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE Trans. Neural Netw. 20(1), 61\u201380 (2008)","journal-title":"IEEE Trans. Neural Netw."},{"key":"9_CR19","doi-asserted-by":"publisher","unstructured":"Smith, R.W.: Hybrid page layout analysis via tab-stop detection. In: 2009 10th International Conference on Document Analysis and Recognition, pp. 241\u2013245 (Jul 2009). https:\/\/doi.org\/10.1109\/ICDAR.2009.257, https:\/\/ieeexplore.ieee.org\/document\/5277715, iSSN: 2379-2140","DOI":"10.1109\/ICDAR.2009.257"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Wang, J., Jin, L., Ding, K.: Lilt: a simple yet effective language-independent layout transformer for structured document understanding (2022). https:\/\/arxiv.org\/abs\/2202.13669","DOI":"10.18653\/v1\/2022.acl-long.534"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Wang, R., Fujii, Y., Popat, A.C.: Post-ocr paragraph recognition by graph convolutional networks. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 493\u2013502 (2022)","DOI":"10.1109\/WACV51458.2022.00259"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Xu, Y., Lv, T., Cui, L., Wang, G., Lu, Y., Florencio, D., Zhang, C., Wei, F.: Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836 (2021)","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Yang, X., Yumer, E., Asente, P., Kraley, M., Kifer, D., Lee\u00a0Giles, C.: Learning to extract semantic structure from documents using multimodal fully convolutional neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5315\u20135324 (2017)","DOI":"10.1109\/CVPR.2017.462"},{"key":"9_CR24","unstructured":"Yao, C.: Docxchain: A powerful open-source toolchain for document parsing and beyond. arXiv preprint arXiv:2310.12430 (2023)"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Yu, F., Wang, D., Shelhamer, E., Darrell, T.: Deep layer aggregation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2403\u20132412 (2018)","DOI":"10.1109\/CVPR.2018.00255"},{"issue":"1","key":"9_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40649-019-0069-y","volume":"6","author":"S Zhang","year":"2019","unstructured":"Zhang, S., Tong, H., Xu, J., Maciejewski, R.: Graph convolutional networks: a comprehensive review. Comput. Soc. Netw. 6(1), 1\u201323 (2019). https:\/\/doi.org\/10.1186\/s40649-019-0069-y","journal-title":"Comput. Soc. Netw."},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, Z., et al.: Layoutdit: layout-aware end-to-end document image translation with multi-step conductive decoder. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 10043\u201310053 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.673"},{"key":"9_CR28","unstructured":"Zhang, Z., et al.: From chaotic ocr words to coherent document: A fine-to-coarse zoom-out network for complex-layout document image translation. In: Proceedings of the 31st International Conference on Computational Linguistics, pp. 10877\u201310890 (2025)"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International conference on document analysis and recognition (ICDAR), pp. 1015\u20131022. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00166"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5676-2_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T20:32:15Z","timestamp":1768249935000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5676-2_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556755","9789819556762"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5676-2_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"13 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}