{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:54:52Z","timestamp":1759334092890,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":13,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032061089"},{"type":"electronic","value":"9783032061096"}],"license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-06109-6_23","type":"book-chapter","created":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T10:05:53Z","timestamp":1759226753000},"page":"402-418","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Scalable Model for Frequency Distribution of Low Occurrence Multi-words Towards Handling Very Large Spectrum of Text Corpora Sizes"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5223-1180","authenticated-orcid":false,"given":"Joaquim F.","family":"Silva","sequence":"first","affiliation":[]},{"given":"Jose C.","family":"Cunha","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,1]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Bernhardsson, S., da\u00a0Rocha, L.E.C., Minnhagen, P.: The meta book and size-dependent properties of written language. CoRR arxiv.org\/abs\/0909.4385 (2009)","DOI":"10.1088\/1367-2630\/11\/12\/123015"},{"key":"23_CR2","unstructured":"Brants, T., Popat, A.C., Xu, P., Och, F.J., Dean, J.: Large language models in machine translation. In: Joint Conference on Empirical Methods in NLP and Computational Natural Language Learning, pp. 858\u2013867. ACL (2007)"},{"key":"23_CR3","unstructured":"Buck, C., Heafield, K., van Ooyen, B.: N-gram counts and language models from the common crawl. In: LREC 2014, pp. 3579\u20133584 (2014)"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Chierichetti, F., Kumar, R., Pang, B.: On the power laws of language: word frequency distributions. In: ACM SIGIR Conference, pp. 385\u2013394 (2017)","DOI":"10.1145\/3077136.3080821"},{"key":"23_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1007\/978-3-030-22741-8_6","volume-title":"Computational Science \u2013 ICCS 2019","author":"C Goncalves","year":"2019","unstructured":"Goncalves, C., Silva, J.F., Cunha, J.C.: n-gram cache performance in statistical extraction of relevant terms in large Corpora. In: Rodrigues, J.M.F., et al. (eds.) ICCS 2019. LNCS, vol. 11537, pp. 75\u201388. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-22741-8_6"},{"key":"23_CR6","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1007\/s10462-009-9135-4","volume":"32","author":"LQ Ha","year":"2009","unstructured":"Ha, L.Q., Hanna, P., Ming, J., Smith, F.: Extending Zipf\u2019s law to n-grams for large corpora. Artif. Intell. Rev. 32, 101\u2013113 (2009)","journal-title":"Artif. Intell. Rev."},{"key":"23_CR7","doi-asserted-by":"publisher","unstructured":"Jones, E., et al.: SciPy: open source scientific tools for Python (2001). https:\/\/www.scipy.org\/. https:\/\/doi.org\/10.5281\/zenodo.1913564","DOI":"10.5281\/zenodo.1913564"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"L\u00fc, L., Zhang, Z., Zhou, T.: Deviation of zipf\u2019s and heaps\u2019 laws in human languages with limited dictionary sizes. Sci. Rep. 3(1082) (2013)","DOI":"10.1038\/srep01082"},{"issue":"5","key":"23_CR9","doi-asserted-by":"publisher","first-page":"323","DOI":"10.1080\/00107510500052444","volume":"46","author":"MEJ Newman","year":"2005","unstructured":"Newman, M.E.J.: Power laws, pareto distributions and zipf\u2019s law. Contemp. Phys. 46(5), 323\u2013351 (2005)","journal-title":"Contemp. Phys."},{"key":"23_CR10","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"840","DOI":"10.1007\/978-3-030-47436-2_63","volume-title":"Advances in Knowledge Discovery and Data Mining","author":"JF Silva","year":"2020","unstructured":"Silva, J.F., Cunha, J.C.: An empirical model for n-gram frequency distribution in large corpora. In: Lauw, H.W., Wong, R.C.-W., Ntoulas, A., Lim, E.-P., Ng, S.-K., Pan, S.J. (eds.) PAKDD 2020. LNCS (LNAI), vol. 12085, pp. 840\u2013851. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-47436-2_63"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"Silva, J., Cunha, J.: How large corpora sizes influence the distribution of low frequency text n-grams. In: Advances in Knowledge Discovery and Data Mining: PAKDD 2024. LNCS, pp. 210\u2013222. Springer (2024)","DOI":"10.1007\/978-981-97-2259-4_16"},{"issue":"3\/4","key":"23_CR12","doi-asserted-by":"publisher","first-page":"425","DOI":"10.2307\/2333389","volume":"42","author":"H Simon","year":"1955","unstructured":"Simon, H.: On a class of skew distribution functions. Biometrika 42(3\/4), 425\u2013440 (1955)","journal-title":"Biometrika"},{"key":"23_CR13","volume-title":"Human Behavior and the Principle of Least-Effort","author":"GK Zipf","year":"1949","unstructured":"Zipf, G.K.: Human Behavior and the Principle of Least-Effort. Addison-Wesley, Cambridge (1949)"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-06109-6_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T10:05:57Z","timestamp":1759226757000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-06109-6_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,1]]},"ISBN":["9783032061089","9783032061096"],"references-count":13,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-06109-6_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,10,1]]},"assertion":[{"value":"1 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Porto","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecmlpkdd.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}