{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T16:28:43Z","timestamp":1757780923198,"version":"3.40.3"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031705328"},{"type":"electronic","value":"9783031705335"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70533-5_6","type":"book-chapter","created":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:02:25Z","timestamp":1725685345000},"page":"79-96","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["SlideCraft: Synthetic Slides Generation for\u00a0Robust Slide Analysis"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1548-8841","authenticated-orcid":false,"given":"Travis","family":"Seng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6838-3445","authenticated-orcid":false,"given":"Axel","family":"Carlier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2085-9503","authenticated-orcid":false,"given":"Thomas","family":"Forgione","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6755-5774","authenticated-orcid":false,"given":"Vincent","family":"Charvillat","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8994-1736","authenticated-orcid":false,"given":"Wei Tsang","family":"Ooi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,8]]},"reference":[{"key":"6_CR1","unstructured":"marp-team\/marp (2024). https:\/\/github.com\/marp-team\/marp. Original-date: 2018-03-25T12:47:38Z"},{"key":"6_CR2","doi-asserted-by":"publisher","unstructured":"Adcock, J., Cooper, M., Denoue, L., Pirsiavash, H., Rowe, L.A.: TalkMiner: a search engine for online lecture video. In: Proceedings of the International Conference on Multimedia - MM 2010, Firenze, Italy, p.\u00a01507. ACM Press (2010). https:\/\/doi.org\/10.1145\/1873951.1874263. http:\/\/dl.acm.org\/citation.cfm?doid=1873951.1874263","DOI":"10.1145\/1873951.1874263"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Araujo, A., Chaves, J., Lakshman, H., Angst, R., Girod, B.: Large-Scale Query-by-Image Video Retrieval Using Bloom Filters. arXiv arXiv:1604.07939 (2015)","DOI":"10.1109\/ICIP.2015.7351054"},{"key":"6_CR4","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1007\/978-3-030-67667-4_22","volume-title":"Machine Learning and Knowledge Discovery in Databases: Applied Data Science Track","author":"T Blanc-Beyne","year":"2021","unstructured":"Blanc-Beyne, T., Carlier, A., Mouysset, S., Charvillat, V.: Unsupervised human pose estimation on depth images. In: Dong, Y., Mladeni\u0107, D., Saunders, C. (eds.) ECML PKDD 2020. LNCS (LNAI), vol. 12460, pp. 358\u2013373. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-67667-4_22"},{"key":"6_CR5","doi-asserted-by":"publisher","unstructured":"Capobianco, S., Marinai, S.: DocEmul: A Toolkit to Generate Structured Historical Documents, pp. 1186\u20131191 (2017). https:\/\/doi.org\/10.1109\/ICDAR.2017.196","DOI":"10.1109\/ICDAR.2017.196"},{"key":"6_CR6","unstructured":"Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention (2017)"},{"key":"6_CR7","doi-asserted-by":"publisher","unstructured":"Ferreira, A., Nowroozi, E., Barni, M.: VIPPrint: validating synthetic image detection and source linking methods on a large scale dataset of printed documents. J. Imaging 7(3), 50 (2021). https:\/\/doi.org\/10.3390\/jimaging7030050. https:\/\/www.mdpi.com\/2313-433X\/7\/3\/50","DOI":"10.3390\/jimaging7030050"},{"key":"6_CR8","doi-asserted-by":"publisher","unstructured":"Haurilet, M., Al-Halah, Z., Stiefelhagen, R.: SPaSe - multi-label page segmentation for presentation slides. In: 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), Waikoloa Village, HI, USA, pp. 726\u2013734. IEEE (2019). https:\/\/doi.org\/10.1109\/WACV.2019.00082. https:\/\/ieeexplore.ieee.org\/document\/8659181\/","DOI":"10.1109\/WACV.2019.00082"},{"key":"6_CR9","doi-asserted-by":"publisher","unstructured":"Haurilet, M., Roitberg, A., Martinez, M., Stiefelhagen, R.: WiSe - slide segmentation in the wild. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), Sydney, Australia, pp. 343\u2013348. IEEE (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00062. https:\/\/ieeexplore.ieee.org\/document\/8978089\/","DOI":"10.1109\/ICDAR.2019.00062"},{"key":"6_CR10","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep Residual Learning for Image Recognition (2015). https:\/\/doi.org\/10.48550\/arXiv.1512.03385. http:\/\/arxiv.org\/abs\/1512.03385. arXiv:1512.03385","DOI":"10.48550\/arXiv.1512.03385"},{"key":"6_CR11","unstructured":"Jiang, A.Q., et al.: Mistral 7B (2023)"},{"key":"6_CR12","unstructured":"Jocher, G., Chaurasia, A., Qiu, J.: Ultralytics YOLO (2023). https:\/\/github.com\/ultralytics\/ultralytics"},{"issue":"159","key":"6_CR13","first-page":"2","volume":"2007","author":"A Kay","year":"2007","unstructured":"Kay, A.: Tesseract: an open-source optical character recognition engine. Linux J. 2007(159), 2 (2007)","journal-title":"Linux J."},{"key":"6_CR14","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"235","DOI":"10.1007\/978-3-319-46493-0_15","volume-title":"ECCV 2016","author":"A Kembhavi","year":"2016","unstructured":"Kembhavi, A., Salvato, M., Kolve, E., Seo, M., Hajishirzi, H., Farhadi, A.: A diagram is worth a dozen images. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 235\u2013251. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_15"},{"key":"6_CR15","doi-asserted-by":"publisher","unstructured":"Kim, J., Choi, Y., Kahng, M., Kim, J.: FitVid: responsive and flexible video content adaptation. In: CHI Conference on Human Factors in Computing Systems, New Orleans, LA, USA, pp. 1\u201316. ACM (2022). https:\/\/doi.org\/10.1145\/3491102.3501948. https:\/\/dl.acm.org\/doi\/10.1145\/3491102.3501948","DOI":"10.1145\/3491102.3501948"},{"key":"6_CR16","doi-asserted-by":"publisher","unstructured":"Kim, J., Guo, P.J., Cai, C.J., Li, S.W.D., Gajos, K.Z., Miller, R.C.: Data-driven interaction techniques for improving navigation of educational videos. In: Proceedings of the 27th Annual ACM Symposium on User Interface Software and Technology, UIST 2014, pp. 563\u2013572. Association for Computing Machinery, New York (2014). https:\/\/doi.org\/10.1145\/2642918.2647389","DOI":"10.1145\/2642918.2647389"},{"key":"6_CR17","unstructured":"Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: TableBank: table benchmark for image-based table detection and recognition. In: Calzolari, N., et al. (eds.) Proceedings of the Twelfth Language Resources and Evaluation Conference, Marseille, France, pp. 1918\u20131925. European Language Resources Association (2020). https:\/\/aclanthology.org\/2020.lrec-1.236"},{"key":"6_CR18","doi-asserted-by":"publisher","unstructured":"Li, M., Xu, Y., Cui, L., Huang, S., Wei, F., Li, Z., Zhou, M.: DocBank: a benchmark dataset for document layout analysis. In: Scott, D., Bel, N., Zong, C. (eds.) Proceedings of the 28th International Conference on Computational Linguistics, Barcelona, Spain, pp. 949\u2013960. International Committee on Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.coling-main.82. https:\/\/aclanthology.org\/2020.coling-main.82","DOI":"10.18653\/v1\/2020.coling-main.82"},{"key":"6_CR19","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., Dollar, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Honolulu, HI, pp. 936\u2013944. IEEE (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.106. http:\/\/ieeexplore.ieee.org\/document\/8099589\/","DOI":"10.1109\/CVPR.2017.106"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Liwicki, M., Bunke, H.: IAM-OnDB-an on-line english sentence database acquired from handwritten text on a whiteboard. In: Eighth International Conference on Document Analysis and Recognition (ICDAR 2005), pp. 956\u2013961. IEEE (2005)","DOI":"10.1109\/ICDAR.2005.132"},{"key":"6_CR21","doi-asserted-by":"publisher","unstructured":"Masry, A., Do, X.L., Tan, J.Q., Joty, S., Hoque, E.: ChartQA: a benchmark for question answering about charts with visual and logical reasoning. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Findings of the Association for Computational Linguistics: ACL 2022, Dublin, Ireland, pp. 2263\u20132279. Association for Computational Linguistics (2022). https:\/\/doi.org\/10.18653\/v1\/2022.findings-acl.177. https:\/\/aclanthology.org\/2022.findings-acl.177","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Methani, N., Ganguly, P., Khapra, M.M., Kumar, P.: PlotQA: Reasoning over Scientific Plots (2020). http:\/\/arxiv.org\/abs\/1909.00997. arXiv:1909.00997","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"6_CR23","doi-asserted-by":"publisher","unstructured":"Mukhopadhyay, S., Smith, B.: Passive capture and structuring of lectures. In: Proceedings of the seventh ACM International Conference on Multimedia (Part 1) - MULTIMEDIA 1999, Orlando, Florida, United States, pp. 477\u2013487. ACM Press (1999). https:\/\/doi.org\/10.1145\/319463.319690. http:\/\/portal.acm.org\/citation.cfm?doid=319463.319690","DOI":"10.1145\/319463.319690"},{"key":"6_CR24","doi-asserted-by":"publisher","unstructured":"Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.: DocLayNet: a large human-annotated dataset for document-layout segmentation. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington DC, USA, pp. 3743\u20133751. ACM (2022). https:\/\/doi.org\/10.1145\/3534678.3539043. https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539043","DOI":"10.1145\/3534678.3539043"},{"key":"6_CR25","doi-asserted-by":"publisher","unstructured":"Pisaneschi, L., Gemelli, A., Marinai, S.: Automatic generation of scientific papers for data augmentation in document layout analysis. Pattern Recogn. Lett. 167, 38\u201344 (2023). https:\/\/doi.org\/10.1016\/j.patrec.2023.01.018. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167865523000247","DOI":"10.1016\/j.patrec.2023.01.018"},{"key":"6_CR26","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Cortes, C., Lawrence, N., Lee, D., Sugiyama, M., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol.\u00a028. Curran Associates, Inc. (2015). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2015\/file\/14bfa6bb14875e45bba028a21ed38046-Paper.pdf"},{"key":"6_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1007\/978-3-319-46475-6_7","volume-title":"Computer Vision \u2013 ECCV 2016","author":"SR Richter","year":"2016","unstructured":"Richter, S.R., Vineet, V., Roth, S., Koltun, V.: Playing for data: ground truth from computer games. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 102\u2013118. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_7"},{"key":"6_CR28","doi-asserted-by":"publisher","unstructured":"Tanaka, R., Nishida, K., Nishida, K., Hasegawa, T., Saito, I., Saito, K.: SlideVQA: A Dataset for Document Visual Question Answering on Multiple Images (2023). https:\/\/doi.org\/10.48550\/arXiv.2301.04883. http:\/\/arxiv.org\/abs\/2301.04883. arXiv:2301.04883","DOI":"10.48550\/arXiv.2301.04883"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Wang, J., Min, W., Hou, S., Ma, S., Zheng, Y., Jiang, S.: Logodet-3k: a large-scale image dataset for logo detection. ACM Trans. Multimedia Comput. Commun. Appl. (TOMM) 18(1), 1\u201319 (2022)","DOI":"10.1145\/3466780"},{"key":"6_CR30","doi-asserted-by":"publisher","unstructured":"Xu, C., et al.: Lecture2Note: automatic generation of lecture notes from slide-based educational videos. In: 2019 IEEE International Conference on Multimedia and Expo (ICME), pp. 898\u2013903 (2019). https:\/\/doi.org\/10.1109\/ICME.2019.00159. ISSN: 1945-788X","DOI":"10.1109\/ICME.2019.00159"},{"key":"6_CR31","doi-asserted-by":"publisher","unstructured":"Yoo, T., Jeong, H., Lee, D., Jung, H.: LectYS: a system for summarizing lecture videos on YouTube. In: 26th International Conference on Intelligent User Interfaces, College Station, TX, USA, pp. 90\u201392. ACM (2021). https:\/\/doi.org\/10.1145\/3397482.3450722. https:\/\/dl.acm.org\/doi\/10.1145\/3397482.3450722","DOI":"10.1145\/3397482.3450722"},{"key":"6_CR32","doi-asserted-by":"publisher","unstructured":"Zhao, B., Xu, S., Lin, S., Wang, R., Luo, X.: A new visual interface for searching and navigating slide-based lecture videos. In: 2019 IEEE International Conference on Multimedia and Expo (ICME), pp. 928\u2013933 (2019). https:\/\/doi.org\/10.1109\/ICME.2019.00164. ISSN: 1945-788X","DOI":"10.1109\/ICME.2019.00164"},{"key":"6_CR33","doi-asserted-by":"publisher","unstructured":"Zhong, X., Tang, J., Jimeno-Yepes, A.: PubLayNet: Largest Dataset Ever for Document Layout Analysis (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00166","DOI":"10.1109\/ICDAR.2019.00166"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70533-5_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:02:53Z","timestamp":1725685373000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70533-5_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705328","9783031705335"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70533-5_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"8 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2024.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}