{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T08:23:44Z","timestamp":1775550224594,"version":"3.50.1"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T00:00:00Z","timestamp":1743120000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T00:00:00Z","timestamp":1743120000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/100009100","name":"Universiti Brunei Darussalam","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100009100","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Soc. Netw. Anal. Min."],"DOI":"10.1007\/s13278-025-01444-9","type":"journal-article","created":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T22:36:23Z","timestamp":1743374183000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Pre-trained language model for code-mixed text in Indonesian, Javanese, and English using transformer"],"prefix":"10.1007","volume":"15","author":[{"given":"Ahmad Fathan","family":"Hidayatullah","sequence":"first","affiliation":[]},{"given":"Rosyzie Anna","family":"Apong","sequence":"additional","affiliation":[]},{"given":"Daphne Teck Ching","family":"Lai","sequence":"additional","affiliation":[]},{"given":"Atika","family":"Qazi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,28]]},"reference":[{"key":"1444_CR1","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"issue":"8","key":"1444_CR2","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford A, Wu J, Child R, Luan D, Amodei D, Sutskever I et al (2019) Language models are unsupervised multitask learners. OpenAI blog 1(8):9","journal-title":"OpenAI blog"},{"issue":"1","key":"1444_CR3","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel C, Shazeer N, Roberts A, Lee K, Narang S, Matena M, Zhou Y, Li W, Liu PJ (2020) Exploring the limits of transfer learning with a unified text-to-text transformer. The J Mach Learn Res 21(1):5485\u20135551","journal-title":"The J Mach Learn Res"},{"key":"1444_CR4","unstructured":"Santy S, Srinivasan A, Choudhury M (2021) Bertologicomix: How does code-mixing interact with multilingual bert? In: Proceedings of the Second Workshop on Domain Adaptation for NLP, pp. 111\u2013121"},{"key":"1444_CR5","doi-asserted-by":"crossref","unstructured":"Koto F, Rahimi A, Lau JH, Baldwin T (2020) Indolem and indobert: A benchmark dataset and pre-trained language model for indonesian nlp. In: Proceedings of the 28th International Conference on Computational Linguistics, pp. 757\u2013770","DOI":"10.18653\/v1\/2020.coling-main.66"},{"key":"1444_CR6","doi-asserted-by":"crossref","unstructured":"Wongso W, Setiawan DS, Suhartono D (2021) Causal and masked language modeling of javanese language using transformer-based architectures. In: 2021 International Conference on Advanced Computer Science and Information Systems (ICACSIS), pp. 1\u20137. IEEE","DOI":"10.1109\/ICACSIS53237.2021.9631331"},{"issue":"1","key":"1444_CR7","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1186\/s40537-022-00590-7","volume":"9","author":"W Wongso","year":"2022","unstructured":"Wongso W, Lucky H, Suhartono D (2022) Pre-trained transformer-based language models for sundanese. J Big Data 9(1):39","journal-title":"J Big Data"},{"key":"1444_CR8","doi-asserted-by":"crossref","unstructured":"Srinivasan A (2020) Msr india at semeval-2020 task 9: Multilingual models can do code-mixing too. In: Proceedings of the Fourteenth Workshop on Semantic Evaluation, pp. 951\u2013956","DOI":"10.18653\/v1\/2020.semeval-1.122"},{"key":"1444_CR9","unstructured":"Aguilar G, Kar S, Solorio T (2020) Lince: A centralized benchmark for linguistic code-switching evaluation. arXiv preprint arXiv:2005.04322"},{"key":"1444_CR10","doi-asserted-by":"crossref","unstructured":"Khanuja S, Dandapat S, Srinivasan A, Sitaram S, Choudhury M (2020) Gluecos: An evaluation benchmark for code-switched nlp. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 3575\u20133585","DOI":"10.18653\/v1\/2020.acl-main.329"},{"key":"1444_CR11","unstructured":"Baral A, Sarkar A, Deeksha D, Joshi AM (2022) Calbert-code-mixed adaptive language representations using bert. In: AAAI Spring Symposium: MAKE"},{"issue":"3","key":"1444_CR12","first-page":"41","volume":"54","author":"A Das","year":"2013","unstructured":"Das A, Gamb\u00e4ck B (2013) Code-mixing in social media text. Traitement Automatique des Langues 54(3):41\u201364","journal-title":"Traitement Automatique des Langues"},{"key":"1444_CR13","doi-asserted-by":"crossref","unstructured":"Yong Z-X, Zhang R, Forde JZ, Wang S, Cahyawijaya S, Lovenia H, Sutawika L, Cruz JCB, Phan L, Tan YL, et al (2023) Prompting large language models to generate code-mixed texts: The case of south east asian languages. arXiv preprint arXiv:2303.13592","DOI":"10.18653\/v1\/2023.calcs-1.5"},{"key":"1444_CR14","doi-asserted-by":"crossref","unstructured":"Winata GI, Cahyawijaya S, Liu Z, Lin Z, Madotto A, Fung P (2021) Are multilingual models effective in code-switching? In: Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching, pp. 142\u2013153. Association for Computational Linguistics, Online","DOI":"10.18653\/v1\/2021.calcs-1.20"},{"key":"1444_CR15","doi-asserted-by":"crossref","unstructured":"Aji AF, Winata GI, Koto F, Cahyawijaya S, Romadhony A, Mahendra R, Kurniawan K, Moeljadi D, Prasojo RE, Baldwin T, et al (2022) One country, 700+ languages: Nlp challenges for underrepresented languages and dialects in indonesia. arXiv preprint arXiv:2203.13357","DOI":"10.18653\/v1\/2022.acl-long.500"},{"key":"1444_CR16","volume-title":"What are the top 200 most spoken languages","author":"DM Eberhard","year":"2021","unstructured":"Eberhard DM, Simons GF, Fennig CD (2021) What are the top 200 most spoken languages. Languages of the world, Ethnologue"},{"key":"1444_CR17","doi-asserted-by":"crossref","unstructured":"Yulianti E, Kurnia A, Adriani M, Duto YS (2021) Normalisation of indonesian-english code-mixed text and its effect on emotion classification. International Journal of Advanced Computer Science and Applications 12(11)","DOI":"10.14569\/IJACSA.2021.0121177"},{"key":"1444_CR18","unstructured":"Vaswani A. Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Advances in neural information processing systems 30"},{"key":"1444_CR19","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692"},{"key":"1444_CR20","unstructured":"Sanh V, Debut L, Chaumond J, Wolf T (2019) Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108"},{"key":"1444_CR21","unstructured":"Conneau A, Lample G (2019) Cross-lingual language model pretraining. Advances in neural information processing systems 32"},{"key":"1444_CR22","doi-asserted-by":"crossref","unstructured":"Conneau A, Khandelwal K, Goyal N, Chaudhary V, Wenzek G, Guzm\u00e1n F, Grave E, Ott M, Zettlemoyer L, Stoyanov V (2019) Unsupervised cross-lingual representation learning at scale. arXiv preprint arXiv:1911.02116","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"1444_CR23","unstructured":"De\u00a0Vries W, Cranenburgh A, Bisazza A, Caselli T, Noord G, Nissim M (2019) Bertje: A dutch bert model. arXiv preprint arXiv:1912.09582"},{"key":"1444_CR24","unstructured":"Ralethe S (2020) Adaptation of deep bidirectional transformers for afrikaans language. In: Proceedings of the Twelfth Language Resources and Evaluation Conference, pp. 2475\u20132478"},{"key":"1444_CR25","doi-asserted-by":"crossref","unstructured":"Kakwani D, Kunchukuttan A, Golla S, Gokul N, Bhattacharyya A, Khapra MM, Kumar P (2020) Indicnlpsuite: Monolingual corpora, evaluation benchmarks and pre-trained multilingual language models for indian languages. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 4948\u20134961","DOI":"10.18653\/v1\/2020.findings-emnlp.445"},{"key":"1444_CR26","doi-asserted-by":"crossref","unstructured":"Nguyen DQ, Nguyen AT (2020) Phobert: Pre-trained language models for vietnamese. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 1037\u20131042","DOI":"10.18653\/v1\/2020.findings-emnlp.92"},{"key":"1444_CR27","doi-asserted-by":"crossref","unstructured":"Souza F, Nogueira R, Lotufo R(2020) Bertimbau: pretrained bert models for brazilian portuguese. In: Intelligent Systems: 9th Brazilian Conference, BRACIS 2020, Rio Grande, Brazil, October 20\u201323, 2020, Proceedings, Part I 9, pp. 403\u2013417. Springer","DOI":"10.1007\/978-3-030-61377-8_28"},{"key":"1444_CR28","doi-asserted-by":"crossref","unstructured":"Martin L, Muller B, Suarez PO, Dupont Y, Romary L, De\u00a0La\u00a0Clergerie \u00c9V, Seddah D, Sagot B (2020) Camembert: a tasty french language model. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7203\u20137219","DOI":"10.18653\/v1\/2020.acl-main.645"},{"key":"1444_CR29","unstructured":"Le H, Vial L, Frej J, Segonne V, Coavoux M, Lecouteux B, Allauzen A, Crabb\u00e9 B, Besacier L, Schwab D (2020) Flaubert: Unsupervised language model pre-training for french. In: Proceedings of the Twelfth Language Resources and Evaluation Conference, pp. 2479\u20132490"},{"key":"1444_CR30","doi-asserted-by":"crossref","unstructured":"Koto F, Lau JH, Baldwin T (2021) Indobertweet: A pretrained language model for indonesian twitter with effective domain-specific vocabulary initialization. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 10660\u201310668","DOI":"10.18653\/v1\/2021.emnlp-main.833"},{"key":"1444_CR31","unstructured":"Li SS, Murray K (2022) Language agnostic code-mixing data augmentation by predicting linguistic patterns. arXiv preprint arXiv:2211.07628"},{"key":"1444_CR32","doi-asserted-by":"crossref","unstructured":"Ansari MZ, Beg, MS, Ahmad T, Khan MJ, Wasim G (2021) Language identification of hindi-english tweets using code-mixed bert. In: 2021 IEEE 20th International Conference on Cognitive Informatics & Cognitive Computing (ICCI* CC), pp. 248\u2013252. IEEE","DOI":"10.1109\/ICCICC53683.2021.9811292"},{"key":"1444_CR33","unstructured":"Nayak R, Joshi R (2022) L3cube-hingcorpus and hingbert: A code mixed hindi-english dataset and bert language models. In: Proceedings of the WILDRE-6 Workshop Within the 13th Language Resources and Evaluation Conference, pp. 7\u201312"},{"key":"1444_CR34","doi-asserted-by":"crossref","unstructured":"Chavan T, Gokhale O, Kane A, Patankar S, Joshi R (2023) My boli: Code-mixed marathi-english corpora, pretrained language models and evaluation benchmarks. arXiv preprint arXiv:2306.14030","DOI":"10.18653\/v1\/2023.findings-ijcnlp.22"},{"key":"1444_CR35","unstructured":"Raihan MN, Goswami D, Mahmud A (2023) Mixed-distil-bert: Code-mixed language modeling for bangla, english, and hindi. arXiv preprint arXiv:2309.10272"},{"key":"1444_CR36","doi-asserted-by":"crossref","unstructured":"Nguyen DQ, Vu T, Nguyen AT (2020) Bertweet: A pre-trained language model for english tweets. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 9\u201314","DOI":"10.18653\/v1\/2020.emnlp-demos.2"},{"key":"1444_CR37","unstructured":"Wu Y, Schuster M, Chen Z, Le QV, Norouzi M, Macherey W, Krikun M, Cao Y, Gao Q, Macherey K, et al (2016) Google\u2019s neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144"},{"key":"1444_CR38","doi-asserted-by":"crossref","unstructured":"Sennrich R, Haddow B, Birch A (2015) Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909","DOI":"10.18653\/v1\/P16-1162"},{"key":"1444_CR39","doi-asserted-by":"crossref","unstructured":"Wang K, Reimers N, Gurevych I (2021) Tsdae: Using transformer-based sequential denoising auto-encoder for unsupervised sentence embedding learning. arXiv preprint arXiv:2104.06979","DOI":"10.18653\/v1\/2021.findings-emnlp.59"},{"key":"1444_CR40","doi-asserted-by":"crossref","unstructured":"Gonen H, Iyer S, Blevins T, Smith NA, Zettlemoyer L (2022) Demystifying prompts in language models via perplexity estimation. arXiv preprint arXiv:2212.04037","DOI":"10.18653\/v1\/2023.findings-emnlp.679"},{"key":"1444_CR41","doi-asserted-by":"publisher","first-page":"1312","DOI":"10.7717\/peerj-cs.1312","volume":"9","author":"AF Hidayatullah","year":"2023","unstructured":"Hidayatullah AF, Apong RA, Lai DTC, Qazi A (2023) Corpus creation and language identification for code-mixed indonesian-javanese-english tweets. PeerJ Comput Sci 9:1312","journal-title":"PeerJ Comput Sci"},{"key":"1444_CR42","doi-asserted-by":"crossref","unstructured":"Barik AM, Mahendra R, Adriani M (2019) Normalization of indonesian-english code-mixed twitter data. In: Proceedings of the 5th Workshop on Noisy User-generated Text (W-NUT 2019), pp. 417\u2013424","DOI":"10.18653\/v1\/D19-5554"},{"key":"1444_CR43","doi-asserted-by":"crossref","unstructured":"Utomo HR, Romadhony A (2023) Sentiment analysis on indonesia-english code-mixed data. In: 2023 IEEE 8th International Conference for Convergence in Technology (I2CT), pp. 1\u20136. IEEE","DOI":"10.1109\/I2CT57861.2023.10126234"},{"key":"1444_CR44","doi-asserted-by":"crossref","unstructured":"Tho C, Heryadi Y, Lukas L, Wibowo A (2021) Code-mixed sentiment analysis of indonesian language and javanese language using lexicon based approach. In: Journal of Physics: Conference Series, 1869:012084. IOP Publishing","DOI":"10.1088\/1742-6596\/1869\/1\/012084"},{"key":"1444_CR45","doi-asserted-by":"crossref","unstructured":"Pamungkas EW, Fatmawati A, Nugroho YS, Gunawan D, Sudarmilah E (2022) Hate speech detection in code-mixed indonesian social media: Exploiting multilingual languages resources. In: 2022 Seventh International Conference on Informatics and Computing (ICIC), pp. 1\u20135. IEEE","DOI":"10.1109\/ICIC56845.2022.10006940"}],"container-title":["Social Network Analysis and Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13278-025-01444-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13278-025-01444-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13278-025-01444-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,17]],"date-time":"2025-12-17T08:26:34Z","timestamp":1765959994000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13278-025-01444-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,28]]},"references-count":45,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1444"],"URL":"https:\/\/doi.org\/10.1007\/s13278-025-01444-9","relation":{},"ISSN":["1869-5469"],"issn-type":[{"value":"1869-5469","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,28]]},"assertion":[{"value":"31 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 February 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 February 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 March 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 April 2025","order":6,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":7,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The original online version of this article was revised: to update table 7.","order":8,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval and consent to participate"}}],"article-number":"30"}}