{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T10:54:52Z","timestamp":1759229692208,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":19,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819722617"},{"type":"electronic","value":"9789819722594"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2259-4_16","type":"book-chapter","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T09:02:31Z","timestamp":1713949351000},"page":"210-222","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["How Large Corpora Sizes Influence the\u00a0Distribution of\u00a0Low Frequency Text n-grams"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5223-1180","authenticated-orcid":false,"given":"Joaquim F.","family":"Silva","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6729-8348","authenticated-orcid":false,"given":"Jose C.","family":"Cunha","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,25]]},"reference":[{"issue":"1","key":"16_CR1","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1103\/RevModPhys.74.47","volume":"74","author":"R Albert","year":"2002","unstructured":"Albert, R., Barab\u00e1si, A.L.: Statistical mechanics of complex networks. Rev. Mod. Phys. 74(1), 47\u201397 (2002)","journal-title":"Rev. Mod. Phys."},{"doi-asserted-by":"publisher","unstructured":"Baca\u00ebr, N.: Verhulst and the logistic equation (1838). In: A Short History of Mathematical Population Dynamics, pp. 35\u201339. Springer, London (2011). https:\/\/doi.org\/10.1007\/978-0-85729-115-8_6","key":"16_CR2","DOI":"10.1007\/978-0-85729-115-8_6"},{"key":"16_CR3","first-page":"1","volume":"4","author":"VK Balasubrahmanyan","year":"2002","unstructured":"Balasubrahmanyan, V.K., Naranan, S.: Algorithmic information, complexity and Zipf law. Glottometrics 4, 1\u201326 (2002)","journal-title":"Glottometrics"},{"issue":"5","key":"16_CR4","doi-asserted-by":"publisher","first-page":"215","DOI":"10.1287\/mnsc.15.5.215","volume":"15","author":"FM Bass","year":"1969","unstructured":"Bass, F.M.: A new product growth for model consumer durables. Manage. Sci. 15(5), 215\u2013227 (1969)","journal-title":"Manage. Sci."},{"unstructured":"Bernhardsson, S., da\u00a0Rocha, L.E.C., Minnhagen, P.: Size dependent word frequencies and translational invariance of books. CoRR abs\/0906.0716 (2009)","key":"16_CR5"},{"key":"16_CR6","doi-asserted-by":"publisher","first-page":"386","DOI":"10.1016\/S0019-9958(67)90201-X","volume":"10","author":"AD Booth","year":"1967","unstructured":"Booth, A.D.: A \u201claw\u2019\u2019 of occurrences for words of low frequency. Inf. Control 10, 386\u2013393 (1967)","journal-title":"Inf. Control"},{"unstructured":"Brants, T., Popat, A.C., Xu, P., Och, F.J., Dean, J.: Large language models in machine translation. In: Joint Conference on EMNLP - CoNLL, pp. 858\u2013867. ACL (2007)","key":"16_CR7"},{"unstructured":"Buck, C., Heafield, K., van Ooyen, B.: N-gram counts and language models from the Common Crawl. In: LREC\u201914. European Language Resources Association (2014)","key":"16_CR8"},{"issue":"5","key":"16_CR9","doi-asserted-by":"publisher","first-page":"702","DOI":"10.1002\/asi.20524","volume":"58","author":"L Egghe","year":"2007","unstructured":"Egghe, L.: Untangling Herdan\u2019s law and Heaps\u2019 law: mathematical and informetric arguments. J. Am. Soc. Inf. Sci. Technol. 58(5), 702\u2013709 (2007)","journal-title":"J. Am. Soc. Inf. Sci. Technol."},{"key":"16_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1007\/978-3-030-22741-8_6","volume-title":"Computational Science \u2013 ICCS 2019","author":"C Goncalves","year":"2019","unstructured":"Goncalves, C., Silva, J.F., Cunha, J.C.: n-gram cache performance in statistical extraction of relevant terms in large corpora. In: Rodrigues, J.M.F., et al. (eds.) ICCS 2019. LNCS, vol. 11537, pp. 75\u201388. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-22741-8_6"},{"key":"16_CR11","doi-asserted-by":"publisher","first-page":"1082","DOI":"10.1038\/srep01082","volume":"3","author":"L L\u00fc","year":"2013","unstructured":"L\u00fc, L., Zhang, Z.K., Zhou, T.: Deviation of Zipf and Heaps laws in human languages with limited dictionary sizes. Sci. Rep. 3, 1082 (2013). https:\/\/doi.org\/10.1038\/srep01082","journal-title":"Sci. Rep."},{"key":"16_CR12","first-page":"190","volume":"12","author":"B Mandelbrot","year":"1953","unstructured":"Mandelbrot, B.: On the theory of word frequencies and on related Markovian models of discourse. Struct. Lang. Math. Aspects 12, 190\u2013219 (1953)","journal-title":"Struct. Lang. Math. Aspects"},{"issue":"5","key":"16_CR13","doi-asserted-by":"publisher","first-page":"323","DOI":"10.1080\/00107510500052444","volume":"46","author":"M Newman","year":"2005","unstructured":"Newman, M.: Power laws, Pareto distributions and Zipf law. Contemp. Phys. 46(5), 323\u2013351 (2005)","journal-title":"Contemp. Phys."},{"issue":"5","key":"16_CR14","doi-asserted-by":"publisher","first-page":"292","DOI":"10.1002\/asi.4630270505","volume":"27","author":"DS Price","year":"1976","unstructured":"Price, D.S.: A general theory of bibliometric and other cumulative advantage processes. J. Am. Soc. Inf. Sci. 27(5), 292\u2013306 (1976)","journal-title":"J. Am. Soc. Inf. Sci."},{"key":"16_CR15","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"840","DOI":"10.1007\/978-3-030-47436-2_63","volume-title":"Advances in Knowledge Discovery and Data Mining","author":"JF Silva","year":"2020","unstructured":"Silva, J.F., Cunha, J.C.: An empirical model for n-gram frequency distribution in large corpora. In: Lauw, H.W., Wong, R.C.-W., Ntoulas, A., Lim, E.-P., Ng, S.-K., Pan, S.J. (eds.) PAKDD 2020. LNCS (LNAI), vol. 12085, pp. 840\u2013851. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-47436-2_63"},{"key":"16_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"699","DOI":"10.1007\/978-3-030-77961-0_55","volume-title":"Computational Science \u2013 ICCS 2021","author":"JF Silva","year":"2021","unstructured":"Silva, J.F., Cunha, J.C.: A model for predicting n-gram frequency distribution in large corpora. In: Paszynski, M., Kranzlm\u00fcller, D., Krzhizhanovskaya, V.V., Dongarra, J.J., Sloot, P.M.A. (eds.) ICCS 2021. LNCS, vol. 12742, pp. 699\u2013706. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-77961-0_55"},{"doi-asserted-by":"crossref","unstructured":"Silva, J.F., Gon\u00e7alves, C., Cunha, J.C.: A theoretical model for n-gram distribution in big data corpora. In: 2016 IEEE International Conference on Big Data, pp. 134\u2013141 (2016)","key":"16_CR17","DOI":"10.1109\/BigData.2016.7840598"},{"issue":"3\/4","key":"16_CR18","doi-asserted-by":"publisher","first-page":"425","DOI":"10.2307\/2333389","volume":"42","author":"H Simon","year":"1955","unstructured":"Simon, H.: On a class of skew distribution functions. Biometrika 42(3\/4), 425\u2013440 (1955)","journal-title":"Biometrika"},{"key":"16_CR19","volume-title":"Human Behavior and the Principle of Least-Effort","author":"GK Zipf","year":"1949","unstructured":"Zipf, G.K.: Human Behavior and the Principle of Least-Effort. Addison-Wesley, Cambridge (1949)"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2259-4_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T23:18:44Z","timestamp":1714000724000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2259-4_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819722617","9789819722594"],"references-count":19,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2259-4_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"25 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taipei","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiwan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 May 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 May 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pakdd2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}