{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T23:25:07Z","timestamp":1772148307488,"version":"3.50.1"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031416811","type":"print"},{"value":"9783031416828","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-41682-8_27","type":"book-chapter","created":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T07:02:59Z","timestamp":1692342179000},"page":"438-453","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Diffusion Models for\u00a0Document Image Generation"],"prefix":"10.1007","author":[{"given":"Noman","family":"Tanveer","sequence":"first","affiliation":[]},{"given":"Adnan","family":"Ul-Hasan","sequence":"additional","affiliation":[]},{"given":"Faisal","family":"Shafait","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,19]]},"reference":[{"key":"27_CR1","unstructured":"Ramesh A., et al.: Zero-Shot Text-to-Image Generation. In: International Conference on Machine Learning (ICML), pp. 8821\u20138831 (2021)"},{"key":"27_CR2","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical Text-Conditional Image Generation with CLIP Latents. In: arXiv, preprint: arXiv:2204.06125, (2022)"},{"key":"27_CR3","unstructured":"Saharia, C., et al.: Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In: arXiv, preprint: arXiv:2205.11487, (2022)"},{"key":"27_CR4","unstructured":"Razavi, A., Van-den-Oord, A., Vinyals, O.: Generating diverse high-fidelity images with VQ-VAE-2. Adv. Neural Inf. Process. Syst. (NeurIPS) 32, 14837\u201314847 (2019)"},{"key":"27_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"555","DOI":"10.1007\/978-3-030-86334-0_36","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"S Biswas","year":"2021","unstructured":"Biswas, S., Riba, P., Llad\u00f3s, J., Pal, U.: DocSynth: A Layout Guided Approach for Controllable Document Image Synthesis. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12823, pp. 555\u2013568. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86334-0_36"},{"key":"27_CR6","doi-asserted-by":"crossref","unstructured":"Bui, Q.A., Mollard, D., Tabbone, S.: Automatic synthetic document image Generation using generative adversarial networks: application in mobile-captured document analysis. In: International Conference on Document Analysis and Recognition (ICDAR), pp. 393\u2013400, IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00070"},{"key":"27_CR7","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using non-equilibrium thermodynamics. In: International Conference on Machine Learning (ICML), pp. 2256\u20132265, PMLR (2015)"},{"key":"27_CR8","unstructured":"Welling, M., Teh, Y.W.: Bayesian learning via stochastic gradient langevin dynamics. In: Proceedings of the 28th International Conference on Machine Learning (ICML), vol 28, pp. 681\u2013688 (2011)"},{"key":"27_CR9","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. Adv. Neural Inf. Process. Syst. (NeurIPS) 32, 11895\u201311907 (2019)"},{"key":"27_CR10","unstructured":"Song, Y., Ermon, S.: Improved techniques for training score-based generative models. Adv. Neural Inf. Process. Syst.(NeurIPS) 33, 12438\u201312448 (2020)"},{"key":"27_CR11","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural Inf. Process. Syst. (NeurIPS) 33, 6840\u20136851 (2020)"},{"key":"27_CR12","unstructured":"Song, J., Meng, C., Ermon, S: Denoising Diffusion Implicit Models. In: arXiv, preprint: arXiv:2010.02502 (2020)"},{"key":"27_CR13","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning (ICML), pp. 8162\u20138171 PMLR (2021)"},{"key":"27_CR14","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. Adv. Neural Inf. Process. Syst. (NIPS) 34, 8780\u20138794 (2021)"},{"key":"27_CR15","unstructured":"Ho, J., Salimans, T.: Classifier-free Diffusion Guidance. In: arXiv, preprint: arXiv:2207.12598 (2022)"},{"key":"27_CR16","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. In: arXiv, preprint: arXiv:2011.13456 (2020)"},{"key":"27_CR17","unstructured":"Nichol, A.,et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. In: arXiv, preprint: arXiv:2112.10741 (2021)"},{"key":"27_CR18","doi-asserted-by":"crossref","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. J. Mach. Learn. Res. 23, 1\u201333 (2022)","DOI":"10.1109\/TPAMI.2022.3204461"},{"key":"27_CR19","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. In: arXiv, preprint: arXiv:2204.06125 (2022)"},{"key":"27_CR20","unstructured":"Saharia, C., et al.: Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. arXiv, preprint: arXiv:2205.11487 (2022)"},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Zhong, X., Tang, J., Yepes, A.J.: PubLayNet: largest dataset-ever for document layout analysis. In: International Conference on Document Analysis and Recognition (ICDAR), pp. 1015\u20131022. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00166"},{"key":"27_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: Common Objects in Context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 248\u2013255, IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"27_CR25","doi-asserted-by":"crossref","unstructured":"Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.: DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. arXiv, preprint: arXiv:2206.01062 (2022)","DOI":"10.1145\/3534678.3539043"},{"key":"27_CR26","unstructured":"EPA United States Environment Protection Agency. https:\/\/www.epa.gov\/facts-and-figures-about-materials-waste-and-recycling\/national-overview-facts-and-figures-materials?_ga=2.202832145.1018593204.1622837058-191240632.1618425162"},{"key":"27_CR27","unstructured":"Forbes Report. https:\/\/www.forbes.com\/sites\/forbestechcouncil\/2020\/04\/02\/going-paperless-a-journey-worth-taking\/?sh=72561e4a5ca1"},{"key":"27_CR28","doi-asserted-by":"crossref","unstructured":"Wiseman, S., Shieber, S.M., Rush, A.M.: Challenges in data-to-document generation. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 2253\u20132263, Copenhagen, Denmark. Association for Computational Linguistics (2017)","DOI":"10.18653\/v1\/D17-1239"},{"key":"27_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"525","DOI":"10.1007\/978-3-030-86159-9_38","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021 Workshops","author":"S Biswas","year":"2021","unstructured":"Biswas, S., Riba, P., Llad\u00f3s, J., Pal, U.: Graph-Based Deep Generative Modelling for Document Layout Generation. In: Barney Smith, E.H., Pal, U. (eds.) ICDAR 2021. LNCS, vol. 12917, pp. 525\u2013537. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86159-9_38"},{"key":"27_CR30","unstructured":"Brown, T., et al.: Language models are Few-shot learners. Adv. Neural Inf. Process. Syst. NIPS 33, 1877\u20131901 (2020)"},{"key":"27_CR31","doi-asserted-by":"crossref","unstructured":"Horak, W.: Office document architecture and office document interchange formats. current status of international standardization. Computer 10, 50\u201360 (1985)","DOI":"10.1109\/MC.1985.1662713"},{"key":"27_CR32","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12873\u201312883, (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"27_CR33","unstructured":"Kim, T., Bengio, Y.: Deep-directed Generative Models with Energy-based Probability Estimation. In: arXiv, preprint arXiv:1606.03439 (2016)"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Yang, L., Karniadakis, G.E.: Potential flow generator with L-2 optimal transport regularity for generative models. IEEE Trans. Neural Netw. Learn. Syst. 33, 528\u2013538 (2020)","DOI":"10.1109\/TNNLS.2020.3028042"},{"key":"27_CR35","unstructured":"Zhang, L., E., W., Wang, L.: Monge-ampere flow for generative modeling. arXiv, preprint arXiv:1809.10188 (2018)"},{"key":"27_CR36","unstructured":"Metz, L., Poole, B., Pfau, D., Sohl-Dickstein, J.: Unrolled generative adversarial networks. In: International Conference on Learning Representations, ICLR (2017)"},{"key":"27_CR37","unstructured":"Arjovsky, M., Chintala, S., Bottou, L.: Wasserstein generative adversarial network. In: Proceedings of the International Conference on Machine Learning (ICML), vol. 70, pp. 214\u2013223 (2017)"},{"key":"27_CR38","doi-asserted-by":"crossref","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. Adv. Neural Inf. Process. Syst. (NIPS) 27, 139\u2013144 (2014)","DOI":"10.1145\/3422622"},{"key":"27_CR39","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale gan training for high fidelity natural image synthesis. In: International Conference on Learning Representations (ICLR), vol. 7 (2019)"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Vo, D.M., Nguyen, D.M., Le, T.P., Lee, S.W.: HI-GAN: a hierarchical generative adversarial network for blind denoising of real photographs, Elsevier Science Inc. Inf. Sci. 570, 225\u2013240 (2021)","DOI":"10.1016\/j.ins.2021.04.045"},{"key":"27_CR41","unstructured":"Karras, T., et al.: Alias-free generative adversarial networks. Adv. Neural Inf. Process. Syst. (NeurIPS) 34, 852\u2013863 (2021)"},{"key":"27_CR42","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT), vol 1, pp. 4171\u20134186 (2019)"},{"key":"27_CR43","unstructured":"Oord, A.V.D., Vinyals, O., Kavukcuoglu, K.: Neural discrete representation learning. Adv. Neural Inf. Process. Syst. (NIPS) 30, 6306\u20136315 (2017)"},{"key":"27_CR44","unstructured":"Hutter, L.: Decoupled weight decay regularization. In: International Conference on Learning Representations (ICLR), vol 7 (2019)"},{"key":"27_CR45","volume-title":"Child, Luan, Amodei, Sutskever: Language Models are Unsupervised Multitask Learners","author":"W Radford","year":"2019","unstructured":"Radford, W.: Child, Luan, Amodei, Sutskever: Language Models are Unsupervised Multitask Learners. OpenAI, Technical Report (2019)"},{"key":"27_CR46","doi-asserted-by":"crossref","unstructured":"Peters, M.E., Neumann, M., Iyyer, M., Gardner, M., Clark, C., Lee, K., Zettlemoyer, L.: Deep Contextualized Word Representations. In: Proceedings of the conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. vol. 1, pp. 2227\u20132237 (2018)","DOI":"10.18653\/v1\/N18-1202"},{"key":"27_CR47","doi-asserted-by":"crossref","unstructured":"Karras, T, Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and Improving the Image Quality of StyleGAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8110\u20138119, (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"27_CR48","unstructured":"Beaumont, R.: img2dataset: Easily turn large sets of image urls to an image dataset. In Github, https:\/\/github.com\/rom1504\/img2dataset (2021)"},{"key":"27_CR49","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. (NIPS) 28, 91\u201399 (2015)"},{"key":"27_CR50","doi-asserted-by":"publisher","first-page":"6460","DOI":"10.3390\/app10186460","volume":"10","author":"J Younas","year":"2020","unstructured":"Younas, J., Siddiqui, S.A., Munir, M., Malik, M.I., Shafait, F., Lukowicz, P., Ahmed, S.: Fi-Fo detector: figure and formula detection using deformable networks. Appl. Sci. 10, 6460 (2020)","journal-title":"Appl. Sci."}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2023"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-41682-8_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T07:21:42Z","timestamp":1692343302000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-41682-8_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031416811","9783031416828"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-41682-8_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"19 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"San Jos\u00e9, CA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 August 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"316","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"154","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"49% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.89","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1.50","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Number and type of other papers accepted : IJDAR track papers","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}