{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T06:08:02Z","timestamp":1758089282735,"version":"3.44.0"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032046239"},{"type":"electronic","value":"9783032046246"}],"license":[{"start":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T00:00:00Z","timestamp":1758067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T00:00:00Z","timestamp":1758067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04624-6_14","type":"book-chapter","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T05:33:26Z","timestamp":1758000806000},"page":"238-256","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Content Alignment with\u00a0LLM for\u00a0Visual Presentation of\u00a0Papers"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0990-1232","authenticated-orcid":false,"given":"Huiying","family":"Hu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6233-440X","authenticated-orcid":false,"given":"Zhicheng","family":"He","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6142-5956","authenticated-orcid":false,"given":"Yixiao","family":"Zhou","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2916-6784","authenticated-orcid":false,"given":"Tongwei","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2354-7749","authenticated-orcid":false,"given":"Xiaoqing","family":"Lyu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,17]]},"reference":[{"key":"14_CR1","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv e-prints pp. arXiv\u20132303 (2023)"},{"key":"14_CR2","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv e-prints pp. arXiv\u20132309 (2023)"},{"key":"14_CR3","unstructured":"Bai, J., et al.: Qwen-vl: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv e-prints pp. arXiv\u20132308 (2023)"},{"key":"14_CR4","doi-asserted-by":"publisher","unstructured":"Bay, H., Tuytelaars, T., Van Gool, L.: SURF: speeded up robust features. In: Leonardis, A., Bischof, H., Pinz, A. (eds.) ECCV 2006. LNCS, vol. 3951, pp. 404\u2013417. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11744023_32","DOI":"10.1007\/11744023_32"},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Bulut, B., G\u00fcndo\u011fan, E., Kaya, B., Alhajj, R., Kaya, M.: User\u2019s research interests based paper recommendation system: a deep learning approach. In: Putting Social Media and Networking Data in Practice for Education, Planning, Prediction and Recommendation, pp. 
117\u2013130 (2020)","DOI":"10.1007\/978-3-030-33698-1_7"},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: Internvl: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Ebesu, T., Fang, Y.: Neural citation network for context-aware citation recommendation. In: Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1093\u20131096 (2017)","DOI":"10.1145\/3077136.3080730"},{"issue":"12","key":"14_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11432-024-4250-y","volume":"67","author":"H Feng","year":"2024","unstructured":"Feng, H., et al.: Docpedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding. Sci. China Inf. Sci. 67(12), 1\u201314 (2024)","journal-title":"Sci. China Inf. Sci."},{"key":"14_CR9","doi-asserted-by":"crossref","unstructured":"Fu, T.J., Wang, W.Y., McDuff, D., Song, Y.: Doc2ppt: automatic presentation slides generation from scientific documents. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 634\u2013642 (2022)","DOI":"10.1609\/aaai.v36i1.19943"},{"key":"14_CR10","unstructured":"He, X., et al.: Matchanything: universal cross-modality image matching with large-scale pre-training. arXiv e-prints pp. arXiv\u20132501 (2025)"},{"issue":"6","key":"14_CR11","doi-asserted-by":"publisher","first-page":"717","DOI":"10.1006\/jmla.1993.1036","volume":"32","author":"M Hegarty","year":"1993","unstructured":"Hegarty, M., Just, M.A.: Constructing mental models of machines from text and diagrams. J. Mem. Lang. 32(6), 717\u2013742 (1993)","journal-title":"J. Mem. Lang."},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Hu, A., et al.: mplug-paperowl: scientific diagram analysis with the multimodal large language model. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 6929\u20136938 (2024)","DOI":"10.1145\/3664647.3681294"},{"key":"14_CR13","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1007\/s00799-023-00352-7","volume":"24","author":"AR Kashyap","year":"2023","unstructured":"Kashyap, A.R., Yang, Y., Kan, M.Y.: Scientific document processing: challenges for modern learning methods. Int. J. Digit. Libr. 24, 283\u2013309 (2023). https:\/\/doi.org\/10.1007\/s00799-023-00352-7","journal-title":"Int. J. Digit. Libr."},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Li, H., Ying, L., Zhang, H., Wu, Y., Qu, H., Wang, Y.: Notable: on-the-fly assistant for data storytelling in computational notebooks. In: Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems, pp. 1\u201316 (2023)","DOI":"10.1145\/3544548.3580965"},{"key":"14_CR15","unstructured":"Liu, Y., et al.: Textmonkey: an ocr-free large multimodal model for understanding document. CoRR (2024)"},{"issue":"4","key":"14_CR16","doi-asserted-by":"publisher","first-page":"444","DOI":"10.1037\/0022-0663.84.4.444","volume":"84","author":"RE Mayer","year":"1992","unstructured":"Mayer, R.E., Anderson, R.B.: The instructive animation: helping students build connections between words and pictures in multimedia learning. J. Educ. Psychol. 84(4), 444 (1992)","journal-title":"J. Educ. 
Psychol."},{"key":"14_CR17","doi-asserted-by":"publisher","first-page":"349","DOI":"10.1007\/978-3-031-70533-5_21","volume-title":"Document Analysis and Recognition - ICDAR 2024","author":"O Moured","year":"2024","unstructured":"Moured, O., Zhang, J., Sarfraz, M.S., Stiefelhagen, R.: Altchart: enhancing vlm-based chart summarization through multi-pretext tasks. In: Barney Smith, E.H., Liwicki, M., Peng, L. (eds.) Document Analysis and Recognition - ICDAR 2024, pp. 349\u2013366. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-70533-5_21"},{"issue":"13","key":"14_CR18","doi-asserted-by":"publisher","first-page":"3812","DOI":"10.1093\/nar\/gkg509","volume":"31","author":"PC Ng","year":"2003","unstructured":"Ng, P.C., Henikoff, S.: Sift: predicting amino acid changes that affect protein function. Nucleic Acids Res. 31(13), 3812\u20133814 (2003)","journal-title":"Nucleic Acids Res."},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Pu, X., Yang, P., Yuan, L., Gao, X.: Improving image-text matching by integrating word sense disambiguation. IEEE Signal Process. Lett. (2024)","DOI":"10.1109\/LSP.2024.3466992"},{"key":"14_CR20","doi-asserted-by":"crossref","unstructured":"Sun, E., Hou, Y., Wang, D., Zhang, Y., Wang, N.X.: D2s: document-to-slide generation via query-based text summarization. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 1405\u20131418 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.111"},{"key":"14_CR21","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/978-3-031-70533-5_13","volume-title":"Document Analysis and Recognition - ICDAR 2024","author":"A Trivedi","year":"2024","unstructured":"Trivedi, A., Upadhyay, A., Mukhopadhyay, R., Chaudhury, S.: GDP: generic document pretraining to improve document understanding. In: Barney Smith, E.H., Liwicki, M., Peng, L. (eds.) Document Analysis and Recognition - ICDAR 2024, pp. 208\u2013226. Springer, Cham (2024)"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Wang, D., et al.: Docllm: a layout-aware generative language model for multimodal document understanding. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, vol. 1: Long Papers, pp. 8529\u20138548 (2024)","DOI":"10.18653\/v1\/2024.acl-long.463"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Wang, F., et al.: Slide4n: creating presentation slides from computational notebooks with human-ai collaboration. In: Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems, pp. 1\u201318 (2023)","DOI":"10.1145\/3544548.3580753"},{"key":"14_CR24","unstructured":"Wu, Z., et\u00a0al.: Deepseek-vl2: mixture-of-experts vision-language models for advanced multimodal understanding. arXiv e-prints pp. arXiv\u20132412 (2024)"},{"issue":"12","key":"14_CR25","doi-asserted-by":"publisher","first-page":"5412","DOI":"10.1109\/TNNLS.2020.2967597","volume":"31","author":"X Xu","year":"2020","unstructured":"Xu, X., Wang, T., Yang, Y., Zuo, L., Shen, F., Shen, H.T.: Cross-modal attention with semantic consistence for image-text matching. IEEE Trans. Neural Netw. Learn. Syst. 31(12), 5412\u20135425 (2020)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Ye, J., et\u00a0al.: Ureader: universal ocr-free visually-situated language understanding with multimodal large language model. 
In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 2841\u20132858 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Zhao, Y., et al.: Attention bootstrapping for multi-modal test-time adaptation. arXiv preprint arXiv:2503.02221 (2025)","DOI":"10.1609\/aaai.v39i21.34446"},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Zheng, C., Wang, D., Wang, A.Y., Ma, X.: Telling stories from computational notebooks: AI-assisted presentation slides creation for presenting data science work. In: Proceedings of the 2022 CHI Conference on Human Factors in Computing Systems, pp. 1\u201320 (2022)","DOI":"10.1145\/3491102.3517615"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04624-6_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T05:33:37Z","timestamp":1758000817000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04624-6_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,17]]},"ISBN":["9783032046239","9783032046246"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04624-6_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,9,17]]},"assertion":[{"value":"17 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that\u00a0are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference 
URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}