{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T13:13:12Z","timestamp":1740143592772,"version":"3.37.3"},"reference-count":62,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,5,12]],"date-time":"2023-05-12T00:00:00Z","timestamp":1683849600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,5,12]],"date-time":"2023-05-12T00:00:00Z","timestamp":1683849600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Lang Resources &amp; Evaluation"],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s10579-023-09651-6","type":"journal-article","created":{"date-parts":[[2023,5,12]],"date-time":"2023-05-12T07:02:44Z","timestamp":1683874964000},"page":"1361-1387","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Automatic language identification: a case study of Pahari languages"],"prefix":"10.1007","volume":"57","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0250-9682","authenticated-orcid":false,"given":"Rachana","family":"Gusain","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satya Ranjan","family":"Dash","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shantipriya","family":"Parida","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Girish Nath","family":"Jha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,5,12]]},"reference":[{"issue":"2","key":"9651_CR1","first-page":"71","volume":"36","author":"G Bharadwaja Kumar","year":"2007","unstructured":"Bharadwaja Kumar, G., Murthy, K. N., & Chaudhuri, B. (2007). Statistical analyses of Telugu text corpora. IJDL. International Journal of Dravidian linguistics, 36(2), 71\u201399.","journal-title":"IJDL. International Journal of Dravidian linguistics"},{"unstructured":"Cavnar, W. B. , & Trenkle, J. M. (1994). N-gram-based text categorization. In Proceedings of sdair-94, 3rd annual symposium on document analysis and information retrieval (Vol. 161175).","key":"9651_CR2"},{"unstructured":"Chang, J. C. , & Lin, C.- C. (2014). Recurrent-neural-network for language detection on twitter code-switching corpus. arXiv:1412.4314.","key":"9651_CR3"},{"unstructured":"\u00c7\u00f6ltekin, \u00c7. , Rama, T. , & Blaschke, V. (2018). T\u00fcbingen-oslo team at the VarDial 2018 evaluation campaign: An analysis of n-gram features in language variety identification. In Proceedings of the fifth workshop on nlp for similar languages, varieties and dialects (vardial 2018) (pp. 55\u201365).","key":"9651_CR4"},{"issue":"3","key":"9651_CR5","doi-asserted-by":"publisher","first-page":"273","DOI":"10.1007\/BF00994018","volume":"20","author":"C Cortes","year":"1995","unstructured":"Cortes, C., & Vapnik, V. (1995). Support-vector networks. Machine Learning, 20(3), 273\u2013297.","journal-title":"Machine Learning"},{"doi-asserted-by":"publisher","unstructured":"Devlin, J. , Chang, M. , Lee, K. , & Toutanova, K. (2019). BERT: pre-training of deep bidirectional transformers for language understanding. In Proceedings of NAACL-HLT (pp. 4171\u20134186). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/v1\/n19-1423","key":"9651_CR6","DOI":"10.18653\/v1\/n19-1423"},{"issue":"4","key":"9651_CR7","doi-asserted-by":"crossref","first-page":"181","DOI":"10.1080\/00031305.1976.10479172","volume":"30","author":"DP Doane","year":"1976","unstructured":"Doane, D. P. (1976). Aesthetic frequency classifications. The American Statistician, 30(4), 181\u2013183.","journal-title":"The American Statistician"},{"doi-asserted-by":"crossref","unstructured":"Dubey, P. , et al. (2013). Machine translation system for Hindi-Dogri language pair. In 2013 international conference on machine intelligence and research advancement (pp. 422\u2013425).","key":"9651_CR8","DOI":"10.1109\/ICMIRA.2013.89"},{"key":"9651_CR9","volume-title":"Statistical identification of language","author":"T Dunning","year":"1994","unstructured":"Dunning, T. (1994). Statistical identification of language. Computing Research Laboratory: New Mexico State University Las Cruces, NM, USA."},{"unstructured":"Duvenhage, B. (2019). Short text language identification for under resourced languages. arXiv:1911.07555.","key":"9651_CR10"},{"unstructured":"Elfardy, H. , & Diab, M. (2013). Sentence level dialect identification in Arabic. In Proceedings of the 51st annual meeting of the association for computational linguistics (Vol. 2: Short papers) (pp. 456\u2013461).","key":"9651_CR11"},{"unstructured":"Grierson, G. A. (1916). Linguistic survey of India. Vol. 9: Indo-Aryan family: Central group; Part IV: Specimens of the Pah\u0101r\u012b languages and Gujur\u012b. Calcutta: Govt. of India, Central Publication Branch.","key":"9651_CR12"},{"doi-asserted-by":"crossref","unstructured":"Gupta, C. P. , & Bal, B. K. (2015). Detecting sentiment in nepali texts: A bootstrap approach for sentiment analysis of texts in the nepali language. In 2015 international conference on cognitive computing and information processing (ccip) (pp. 1\u20134).","key":"9651_CR13","DOI":"10.1109\/CCIP.2015.7100739"},{"doi-asserted-by":"crossref","unstructured":"Harrat, S. , Meftouh, K. , Abbas, M. , Jamoussi, S. , Saad, M. , & Smaili, K. (2015). Cross-dialectal arabic processing. In International conference on intelligent text processing and computational linguistics (pp. 620\u2013632).","key":"9651_CR14","DOI":"10.1007\/978-3-319-18111-0_47"},{"unstructured":"Indhuja, K. , Indu, M. , Sreejith, C. , Sreekrishnapuram, P. , & Raj, P. R. (2014). Text based language identification system for indian languages following devanagiri script. International Journal of Engineering 3(4).","key":"9651_CR15"},{"key":"9651_CR16","doi-asserted-by":"publisher","first-page":"675","DOI":"10.1613\/jair.1.11675","volume":"65","author":"T Jauhiainen","year":"2019","unstructured":"Jauhiainen, T., Lui, M., Zampieri, M., Baldwin, T., & Lind\u00e9n, K. (2019). Automatic language identification in texts: A survey. Journal of Artificial Intelligence Research, 65, 675\u2013782.","journal-title":"Journal of Artificial Intelligence Research"},{"unstructured":"Jauhiainen, T. S. , Jauhiainen, H. A. , Linden, B. K. J. , et al.(2018). Iterative language model adaptation for Indo-Aryan language identification. In Proceedings of the fifth workshop on nlp for similar languages, varieties and dialects (vardial 2018).","key":"9651_CR17"},{"unstructured":"Jha, G. N. (2012). The tdil program and the indian language corpora initiative. In Language resources and evaluation conference.","key":"9651_CR18"},{"issue":"2","key":"9651_CR19","doi-asserted-by":"publisher","first-page":"51","DOI":"10.2478\/v10122-010-0013-y","volume":"52","author":"M Joshi","year":"2010","unstructured":"Joshi, M. (2010). On the origin of the Neo Indo-Aryan Pah\u0101d\u012b Language of Uttarakhand and Western Nepal Himalaya. Lingua Posnaniensis, 52(2), 51\u201365.","journal-title":"Lingua Posnaniensis"},{"unstructured":"Khanuja, S. , Bansal, D. , Mehtani, S. , Khosla, S. , Dey, A. , Gopalan, B. , et al. (2021). Muril: Multilingual representations for indian languages.","key":"9651_CR20"},{"issue":"2","key":"9651_CR21","doi-asserted-by":"publisher","first-page":"265","DOI":"10.1016\/0388-0001(91)90018-V","volume":"13","author":"LM Khubchandani","year":"1991","unstructured":"Khubchandani, L. M. (1991). India as a sociolinguistic area. Language Sciences, 13(2), 265\u2013288. https:\/\/doi.org\/10.1016\/0388-0001(91)90018-V","journal-title":"Language Sciences"},{"doi-asserted-by":"publisher","unstructured":"Koehn, P. , Guzm\u00e1n, F. , Chaudhary, V. , & Pino, J. (2019, August). Findings of the WMT 2019 shared task on parallel corpus filtering for low-resource conditions. In Proceedings of the fourth conference on machine translation (Vol. 3: Shared task papers, day 2) (pp. 54\u201372). Florence, ItalyAssociation for Computational Linguistics. https:\/\/doi.org\/10.18653\/v1\/W19-5404","key":"9651_CR22","DOI":"10.18653\/v1\/W19-5404"},{"unstructured":"Kumar, R. , Lahiri, B. , Alok, D. , Ojha, A. K. , Jain, M. , Basit, A. , & Dawer, Y. (2018). Automatic identification of closely-related indian languages: Resources and experiments. In Proceedings of the eleventh international conference on language resources and evaluation (lrec).","key":"9651_CR23"},{"doi-asserted-by":"publisher","unstructured":"Lamsal, R. (2020). A large scale nepali text corpus. IEEE Dataport. https:\/\/doi.org\/10.21227\/jxrd-d245","key":"9651_CR24","DOI":"10.21227\/jxrd-d245"},{"doi-asserted-by":"crossref","unstructured":"Ljubesic, N. , Mikelic, N. , & Boras, D. (2007). Language identification: How to distinguish similar languages? In 2007 29th international conference on information technology interfaces (pp. 541\u2013546).","key":"9651_CR25","DOI":"10.1109\/ITI.2007.4283829"},{"doi-asserted-by":"crossref","unstructured":"Maharjan, S. , Blair, E. , Bethard, S. , & Solorio, T. (2015). Developing language-tagged corpora for code-switching tweets. In Proceedings of the 9th linguistic annotation workshop (pp. 72\u201384).","key":"9651_CR26","DOI":"10.3115\/v1\/W15-1608"},{"unstructured":"Mallikarjun, B. (2019). Metamorphosis of \u2018 Hindi\u2019 in Modern India\u2013A study of Census of India. Language in India 19(8).","key":"9651_CR27"},{"unstructured":"Malmasi, S. , & Dras, M. (2015). Automatic language identification for Persian and Dari texts. In Proceedings of pacling (pp. 59\u201364).","key":"9651_CR28"},{"unstructured":"Malmasi, S. , Zampieri, M. , Ljube\u0161i\u0107, N. , Nakov, P. , Ali, A. , & Tiedemann, J. (2016). Discriminating between similar languages and arabic dialect identification: A report on the third dsl shared task. In Proceedings of the third workshop on nlp for similar languages, varieties and dialects (vardial3) (pp. 1\u201314).","key":"9651_CR29"},{"doi-asserted-by":"crossref","unstructured":"Martins, B. , & Silva, M. J. (2005). Language identification in web pages. In Proceedings of the 2005 acm symposium on applied computing (pp. 764\u2013768).","key":"9651_CR30","DOI":"10.1145\/1066677.1066852"},{"unstructured":"Masica, C. P. (1993). The Indo-Aryan Languages. Cambridge University Press.","key":"9651_CR31"},{"unstructured":"Mathur, P. , Misra, A. , & Budur, E. (2017). Lide: Language identification from text documents. arXiv:1701.03682.","key":"9651_CR32"},{"unstructured":"McCallum, A. , Nigam, K. , et al.(1998). A comparison of event models for Naive Bayes text classification. In AAAI-98 workshop on learning for text categorization (Vol. 752, pp. 41\u201348).","key":"9651_CR33"},{"doi-asserted-by":"publisher","unstructured":"Mundotiya, R. K. , Singh, M. K. , Kapur, R. , Mishra, S. , & Singh, A. K. (2021, sep). Linguistic resources for bhojpuri, magahi, and maithili: Statistics about them, their similarity estimates, and baselines for three applications. ACM Transcation on Asian Low-Resource Language Information Processinghttps:\/\/doi.org\/10.1145\/3458250","key":"9651_CR34","DOI":"10.1145\/3458250"},{"issue":"01","key":"9651_CR35","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1080\/09296170500500694","volume":"13","author":"KN Murthy","year":"2006","unstructured":"Murthy, K. N., & Kumar, G. B. (2006). Language identification from small text samples. Journal of Quantitative Linguistics, 13(01), 57\u201380.","journal-title":"Journal of Quantitative Linguistics"},{"unstructured":"Mustonen, S. (1965). Multiple discriminant analysis in linguistic problems. Statistical Methods in Linguistics 437\u201344.","key":"9651_CR36"},{"unstructured":"Nakkeerar, R. (2011). Nepali in Sikkim. Linguistic Survey of India-Sikkim, Part II23\u2013120.","key":"9651_CR37"},{"unstructured":"Padr\u00f3, M. , & Padr\u00f3, L. (2004). Comparing methods for language identification. Procesamiento del lenguaje natural 33.","key":"9651_CR38"},{"doi-asserted-by":"crossref","unstructured":"Paul, A. , Purkayastha, B. S. , & Sarkar, S. (2015). Hidden markov model based part of speech tagging for nepali language. In 2015 international symposium on advanced computing and communication (isacc) (pp. 149\u2013156).","key":"9651_CR39","DOI":"10.1109\/ISACC.2015.7377332"},{"key":"9651_CR40","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., et al. (2011). Scikit-learn: Machine Learning in Python Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12, 2825\u20132830.","journal-title":"Journal of Machine Learning Research"},{"issue":"2","key":"9651_CR41","doi-asserted-by":"publisher","first-page":"2201","DOI":"10.3233\/JIFS-179884","volume":"39","author":"R Piryani","year":"2020","unstructured":"Piryani, R., Piryani, B., Singh, V. K., & Pinto, D. (2020). Sentiment analysis in nepali: Exploring machine learning and lexicon-based approaches. Journal of Intelligent & Fuzzy Systems, 39(2), 2201\u20132212.","journal-title":"Journal of Intelligent & Fuzzy Systems"},{"doi-asserted-by":"crossref","unstructured":"Ranaivo-Malan\u00e7on, B. (2006). Automatic identification of close languages-case study: Malay and indonesian. ECTI Transactions on Computer and Information Technology (ECTI-CIT) 2(2):126\u2013134.","key":"9651_CR42","DOI":"10.37936\/ecti-cit.200622.53288"},{"doi-asserted-by":"crossref","unstructured":"Riyal, M. K. , Upadhyay, R. K. , & Kumar, S. (2021). Entropic analysis of Garhwali text. In Recent developments in acoustics (pp. 43\u201349). Springer.","key":"9651_CR43","DOI":"10.1007\/978-981-15-5776-7_3"},{"doi-asserted-by":"crossref","unstructured":"Sarkar, S. , Roy, A. , & Purkayastha, B. (2014). A comparative analysis of particle swarm optimization and k-means algorithm for text clustering using nepali wordnet. International Journal on Natural Language Computing (IJNLC) 3(3).","key":"9651_CR44","DOI":"10.5121\/ijnlc.2014.3308"},{"unstructured":"Scannell, K. P. (2007). The cr\u00fabad\u00e1n project: Corpus building for under-resourced languages. In Building and exploring web corpora: Proceedings of the 3rd web as corpus workshop (Vol\u00a04, pp. 5\u201315).","key":"9651_CR45"},{"doi-asserted-by":"crossref","unstructured":"Sch\u00fctze, H. , Manning, C. D. , & Raghavan, P. 2008. Introduction to information retrieval (Vol\u00a039). Cambridge University Press.","key":"9651_CR46","DOI":"10.1017\/CBO9780511809071"},{"key":"9651_CR47","first-page":"24","volume":"70","author":"TB Shahi","year":"2013","unstructured":"Shahi, T. B., Dhamala, T. N., & Balami, B. (2013). Support vector machines based part of speech tagging for nepali text. International Journal of Computer Applications, 70, 24.","journal-title":"International Journal of Computer Applications"},{"doi-asserted-by":"crossref","unstructured":"Shahi, T. B. , & Pant, A. K. (2018). Nepali news classification using na\u00efve bayes, support vector machines and neural networks. In 2018 international conference on communication information and computing technology (iccict) (pp. 1\u20135).","key":"9651_CR48","DOI":"10.1109\/ICCICT.2018.8325883"},{"doi-asserted-by":"crossref","unstructured":"Shahi, T. B. , & Sitaula, C. (2021). Natural language processing for nepali text: a review. Artificial Intelligence Review 1\u201329.","key":"9651_CR49","DOI":"10.1007\/s10462-021-10093-1"},{"issue":"1","key":"9651_CR50","doi-asserted-by":"publisher","first-page":"45","DOI":"10.22632\/ccs-2016-251-25","volume":"1","author":"A Singh","year":"2016","unstructured":"Singh, A., Kour, A., & Jamwal, S. S. (2016). English to Dogri translation system using MOSES. Circulation in Computer Science, 1(1), 45\u201349.","journal-title":"Circulation in Computer Science"},{"doi-asserted-by":"crossref","unstructured":"Singh, O. M. , Padia, A. , & Joshi, A. (2019). Named entity recognition for nepali language. In 2019 IEEE 5th international conference on collaboration and internet computing (cic) (pp. 184\u2013190).","key":"9651_CR51","DOI":"10.1109\/CIC48465.2019.00031"},{"key":"9651_CR52","first-page":"41","volume":"4","author":"C Sitaula","year":"2012","unstructured":"Sitaula, C. (2012). Semantic text clustering using enhanced vector space model using nepali language. Computer Sciences and Telecommunications, 4, 41\u201346.","journal-title":"Computer Sciences and Telecommunications"},{"key":"9651_CR53","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.412","volume":"7","author":"C Sitaula","year":"2021","unstructured":"Sitaula, C., Basnet, A., & Aryal, S. (2021). Vector representation based on a supervised codebook for nepali documents classification. PeerJ Computer Science, 7, e412.","journal-title":"PeerJ Computer Science"},{"key":"9651_CR54","doi-asserted-by":"publisher","first-page":"275","DOI":"10.1515\/flih.2014.008","volume":"35","author":"K Stro\u0144ski","year":"2014","unstructured":"Stro\u0144ski, K. (2014). On the syntax and semantics of the past perfect participle and gerundive in early New Indo Arian Evidence from Eastern Pahari. Folia Linguistica, 35, 275\u2013306.","journal-title":"Folia Linguistica"},{"unstructured":"Tan, L. , Zampieri, M. , Ljube\u0161ic, N. , & Tiedemann, J. (2014). Merging comparable data sources for the discrimination of similar languages: The dsl corpus collection. In Proceedings of the 7th workshop on building and using comparable corpora (bucc) (pp. 11\u201315). Reykjavik, Iceland.","key":"9651_CR55"},{"doi-asserted-by":"crossref","unstructured":"Thapa, L. B. R. , & Bal, B. K. (2016). Classifying sentiments in nepali subjective texts. In 2016 7th international conference on information, intelligence, systems & applications (iisa) (pp. 1\u20136).","key":"9651_CR56","DOI":"10.1109\/IISA.2016.7785374"},{"unstructured":"Tiedemann, J. , & Ljube\u0161i\u0107, N. (2012). Efficient discrimination between closely related languages. In Proceedings of coling 2012 (pp. 2619\u20132634).","key":"9651_CR57"},{"doi-asserted-by":"crossref","unstructured":"Xue, L. , Constant, N. , Roberts, A. , Kale, M. , Al-Rfou, R. , Siddhant, A. , et al. (2021). mt5: A massively multilingual pre-trained text-to-text transformer. In Proceedings of the 2021 conference of the North American chapter of the association for computational linguistics: Human language technologies (pp. 483\u2013498).","key":"9651_CR58","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"issue":"1","key":"9651_CR59","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1162\/COLI_a_00169","volume":"40","author":"OF Zaidan","year":"2014","unstructured":"Zaidan, O. F., & Callison-Burch, C. (2014). Arabic dialect identification. Computational Linguistics, 40(1), 171\u2013202.","journal-title":"Computational Linguistics"},{"unstructured":"Zampieri, M. , & Gebre, B. G. (2012). Automatic identification of language varieties: The case of Portuguese. In Konvens 2012-the 11th conference on natural language processing (pp. 233\u2013237).","key":"9651_CR60"},{"unstructured":"Zampieri, M. , Gebre, B. G. , & Diwersy, S. (2013). N-gram language models and POS distribution for the identification of spanish varieties (ngrammes et traits morphosyntaxiques pour la identification de vari\u00e9t\u00e9s de l\u2019espagnol)[in french]. In Proceedings of taln 2013 (volume 2: Short papers) (pp. 580\u2013587).","key":"9651_CR61"},{"doi-asserted-by":"crossref","unstructured":"Zampieri, M. , Malmasi, S. , Nakov, P. , Ali, A. , Shon, S. , Glass, J. , et al.(2018). Language identification and morphosyntactic tagging. the second VarDial evaluation campaign. In Proceedings of the fifth workshop on nlp for similar languages, varieties and dialects (vardial) (pp. 1\u201317).","key":"9651_CR62","DOI":"10.18653\/v1\/W17-1201"}],"container-title":["Language Resources and Evaluation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10579-023-09651-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10579-023-09651-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10579-023-09651-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T07:48:57Z","timestamp":1729410537000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10579-023-09651-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,12]]},"references-count":62,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["9651"],"URL":"https:\/\/doi.org\/10.1007\/s10579-023-09651-6","relation":{},"ISSN":["1574-020X","1574-0218"],"issn-type":[{"type":"print","value":"1574-020X"},{"type":"electronic","value":"1574-0218"}],"subject":[],"published":{"date-parts":[[2023,5,12]]},"assertion":[{"value":"22 February 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 May 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflicts of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}