{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T21:23:58Z","timestamp":1771536238667,"version":"3.50.1"},"reference-count":18,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T00:00:00Z","timestamp":1762905600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T00:00:00Z","timestamp":1762905600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,11,12]]},"DOI":"10.1109\/o-cocosda68185.2025.11384905","type":"proceedings-article","created":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T21:14:09Z","timestamp":1771449249000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["LAFAEK-Corpus-1M+: A Large-Scale Tetun Corpus to Build a Low-Resourced LLM for Speech and Text Processing"],"prefix":"10.1109","author":[{"given":"Yuichi","family":"Nishida","sequence":"first","affiliation":[{"name":"Gifu University, Graduate School of Engineering,Gifu,Japan,501-1193"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuto","family":"Kuroda","sequence":"additional","affiliation":[{"name":"Gifu University, Graduate School of Engineering,Gifu,Japan,501-1193"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satoshi","family":"Tamura","sequence":"additional","affiliation":[{"name":"Gifu University, Graduate School of Engineering,Gifu,Japan,501-1193"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"ChatGPT","year":"2025"},{"key":"ref2","article-title":"Continual pre-training of large language models: How to re-warm your 
model?","volume-title":"CoRR","author":"Gupta","year":"2023"},{"key":"ref3","article-title":"Continual pre-training for cross-lingual LLM adaptation: Enhancing japanese language capabilities","volume-title":"COLM","author":"Fujii","year":"2024"},{"key":"ref4","article-title":"Efficient and effective text encoding for chinese LLaMA and Alpaca","author":"Cui","year":"2023","journal-title":"arXiv preprint"},{"key":"ref5","volume-title":"LT4All"},{"key":"ref6","article-title":"The Llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"The Pile: An 800 GB dataset of diverse text for language modeling","author":"Gao","year":"2021","journal-title":"arXiv preprint"},{"key":"ref8","volume-title":"The RefinedWeb dataset for falcon LLM: Outperforming curated corpora with web data, and web data only","author":"Penedo","year":"2023"},{"key":"ref9","article-title":"The Stack: 3TB of permissively licensed source code","author":"Kocetkov","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref10","article-title":"MADLAD-400: A multilingual and document-level large audited dataset","author":"Kudugunta","year":"2023","journal-title":"arXiv preprint"},{"key":"ref11","article-title":"Labadain-30k+: A monolingual tetun document-level audited dataset","volume-title":"SIGUL","author":"de Jesus","year":"2024"},{"key":"ref12","volume-title":"Peace Corps East Timor Tetun language course","author":"Klinken","year":"2015"},{"key":"ref13","volume-title":"Discourse Structures of Tetun Dili, an Austronesian Language of Timor-Leste","author":"van Klinken","year":"2020"},{"key":"ref14","article-title":"2024 country report timor leste","volume-title":"Oriental COCOSDA","author":"de Jesus Ornai","year":"2024"},{"key":"ref15","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"ICLR","author":"Hu","year":"2022"},{"key":"ref16","article-title":"Flashattention-2: Faster 
attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref17","article-title":"Bleu: a method for automatic evaluation of machine translation","volume-title":"ACL","author":"Papineni","year":"2002"},{"key":"ref18","article-title":"Bertscore: Evaluating text generation with bert","volume-title":"ICLR","author":"Zhang","year":"2020"}],"event":{"name":"2025 28th Conference of the Oriental COCOSDA International Committee for the Co-ordination and Standardisation of Speech Databases and Assessment Techniques (O-COCOSDA)","location":"Yogyakarta, Indonesia","start":{"date-parts":[[2025,11,12]]},"end":{"date-parts":[[2025,11,14]]}},"container-title":["2025 28th Conference of the Oriental COCOSDA International Committee for the Co-ordination and Standardisation of Speech Databases and Assessment Techniques (O-COCOSDA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11383559\/11384821\/11384905.pdf?arnumber=11384905","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T20:55:17Z","timestamp":1771534517000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11384905\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,12]]},"references-count":18,"URL":"https:\/\/doi.org\/10.1109\/o-cocosda68185.2025.11384905","relation":{},"subject":[],"published":{"date-parts":[[2025,11,12]]}}}