{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T00:59:18Z","timestamp":1781225958460,"version":"3.54.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004410","name":"Scientific and Technological Research Council of Turkey","doi-asserted-by":"publisher","award":["124E055"],"award-info":[{"award-number":["124E055"]}],"id":[{"id":"10.13039\/501100004410","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.knosys.2026.115880","type":"journal-article","created":{"date-parts":[[2026,4,5]],"date-time":"2026-04-05T10:06:30Z","timestamp":1775383590000},"page":"115880","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["KFPT: Reliability and uncertainty filtered self-distillation for language model training"],"prefix":"10.1016","volume":"343","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-2094-4358","authenticated-orcid":false,"given":"Muzaffer","family":"Kaan Yuce","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0404-5973","authenticated-orcid":false,"given":"Mehmet","family":"Fatih Amasyali","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.115880_bib0001","unstructured":"R. Bommasani, D. A. Hudson, E. Adeli, R. Altman, S. Arora, S. von Arx, M. S. Bernstein, J. Bohg, A. Bosselut, E. Brunskill, E. Brynjolfsson, S. Buch, D. Card, R. Castellon, N. Chatterji, A. Chen, K. Creel, J. Q. Davis, D. Demszky, C. Donahue, M. Doumbouya, E. Durmus, S. Ermon, J. Etchemendy, K. Ethayarajh, L. Fei-Fei, C. Finn, T. Gale, L. Gillespie, K. Goel, et al., On the opportunities and risks of foundation models, (2021). https:\/\/arxiv.org\/abs\/2108.07258."},{"key":"10.1016\/j.knosys.2026.115880_bib0002","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.knosys.2026.115880_bib0003","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"10.1016\/j.knosys.2026.115880_bib0004","unstructured":"A. Radford, J. Wu, R. Child, D. Luan, D. Amodei, I. Sutskever, Language Models are Unsupervised Multitask Learners, 2019, (OpenAI Technical Report). https:\/\/cdn.openai.com\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf."},{"key":"10.1016\/j.knosys.2026.115880_bib0005","series-title":"Deep Learning","author":"Goodfellow","year":"2016"},{"key":"10.1016\/j.knosys.2026.115880_bib0006","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)","first-page":"7871","article-title":"BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"Lewis","year":"2020"},{"issue":"140","key":"10.1016\/j.knosys.2026.115880_bib0007","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.knosys.2026.115880_bib0008","unstructured":"A. Chowdhery, S. Narang, J. Devlin, M. Bosma, G. Mishra, A. Roberts, P. Barham, H.W. Chung, C. Sutton, S. Gehrmann, et al., PaLM: scaling language modeling with pathways, (2022). https:\/\/arxiv.org\/abs\/2204.02311."},{"key":"10.1016\/j.knosys.2026.115880_bib0009","doi-asserted-by":"crossref","unstructured":"J. Hoffmann, S. Borgeaud, A. Mensch, E. Buchatskaya, T. Cai, E. Rutherford, D. de Las Casas, L.A. Hendricks, J. Welbl, A. Clark, T. Hennigan, E. Noland, K. Millican, G. van den Driessche, B. Damoc, A. Guy, S. Osindero, K. Simonyan, E. Elsen, J.W. Rae, O. Vinyals, L. Sifre, Training compute-optimal large language models, (2022). https:\/\/arxiv.org\/abs\/2203.15556.","DOI":"10.52202\/068431-2176"},{"key":"10.1016\/j.knosys.2026.115880_bib0010","unstructured":"J. Kaplan, S. McCandlish, T. Henighan, T.B. Brown, B. Chess, R. Child, S. Gray, A. Radford, J. Wu, D. Amodei, Scaling laws for neural language models, (2020). https:\/\/arxiv.org\/abs\/2001.08361."},{"key":"10.1016\/j.knosys.2026.115880_bib0011","series-title":"Proceedings of the 34th International Conference on Machine Learning (ICML)","first-page":"1321","article-title":"On calibration of modern neural networks","author":"Guo","year":"2017"},{"key":"10.1016\/j.knosys.2026.115880_bib0012","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","article-title":"Focal loss for dense object detection","author":"Lin","year":"2017"},{"key":"10.1016\/j.knosys.2026.115880_bib0013","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","first-page":"1171","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"Bengio","year":"2015"},{"key":"10.1016\/j.knosys.2026.115880_bib0014","series-title":"International Conference on Learning Representations (ICLR)","article-title":"Neural text generation with unlikelihood training","author":"Welleck","year":"2020"},{"key":"10.1016\/j.knosys.2026.115880_bib0015","unstructured":"G. Pereyra, G. Tucker, J. Chorowski, \u0141. Kaiser, G. Hinton, Regularizing neural networks by penalizing confident output distributions, (2017). https:\/\/arxiv.org\/abs\/1701.06548."},{"key":"10.1016\/j.knosys.2026.115880_bib0016","series-title":"Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP)","first-page":"423","article-title":"Self-knowledge distillation in natural language processing","author":"Hahn","year":"2019"},{"key":"10.1016\/j.knosys.2026.115880_bib0017","unstructured":"Y. Fu, Y. Yu, X. Han, R. Li, X. Long, H. Yu, P. Li, Dynamic self-distillation via previous mini-batches for fine-tuning small language models, (2024). https:\/\/arxiv.org\/abs\/2411.16991."},{"key":"10.1016\/j.knosys.2026.115880_bib0018","series-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)","first-page":"10208","article-title":"Layer-wise regularized dropout for neural language models","author":"Ni","year":"2024"},{"key":"10.1016\/j.knosys.2026.115880_bib0019","unstructured":"Z. Su, X. Wu, X. Bai, Z. Lin, H. Chen, G. Ding, W. Zhou, S. Hu, MiLe loss: a new loss for mitigating the bias of learning difficulties in generative language models, (2023). https:\/\/arxiv.org\/abs\/2310.19531."},{"key":"10.1016\/j.knosys.2026.115880_bib0020","unstructured":"D. Mahajan, S. Goyal, B. Youbi Idrissi, M. Pezeshki, I. Mitliagkas, D. Lopez-Paz, K. Ahuja, Beyond multi-token prediction: pretraining LLMs with future summaries, (2025). https:\/\/arxiv.org\/abs\/2510.14751."},{"key":"10.1016\/j.knosys.2026.115880_bib0021","series-title":"International Conference on Learning Representations (ICLR)","article-title":"Preserving diversity in supervised fine-tuning of large language models","author":"Li","year":"2025"},{"key":"10.1016\/j.knosys.2026.115880_bib0022","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","article-title":"Do deep nets really need to be deep?","author":"Ba","year":"2014"},{"key":"10.1016\/j.knosys.2026.115880_bib0023","series-title":"Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)","first-page":"535","article-title":"Model compression","author":"Bucilu\u0103","year":"2006"},{"key":"10.1016\/j.knosys.2026.115880_bib0024","series-title":"International Conference on Learning Representations (ICLR)","article-title":"FitNets: hints for thin deep nets","author":"Romero","year":"2015"},{"key":"10.1016\/j.knosys.2026.115880_bib0025","series-title":"Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP)","first-page":"1317","article-title":"Sequence-level knowledge distillation","author":"Kim","year":"2016"},{"key":"10.1016\/j.knosys.2026.115880_bib0026","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","first-page":"5191","article-title":"Improved knowledge distillation via teacher assistant","author":"Mirzadeh","year":"2020"},{"key":"10.1016\/j.knosys.2026.115880_bib0027","doi-asserted-by":"crossref","first-page":"12013","DOI":"10.1007\/s00521-025-11162-0","article-title":"Mitigating exposure bias in large language model distillation: an imitation learning approach","volume":"37","author":"Pozzi","year":"2025","journal-title":"Neural Comput. Appl."},{"key":"10.1016\/j.knosys.2026.115880_bib0028","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","article-title":"TinyBERT: distilling BERT for natural language understanding","author":"Jiao","year":"2020"},{"key":"10.1016\/j.knosys.2026.115880_bib0029","series-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)","first-page":"4323","article-title":"Patient knowledge distillation for BERT model compression","author":"Sun","year":"2019"},{"key":"10.1016\/j.knosys.2026.115880_bib0030","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)","first-page":"2158","article-title":"MobileBERT: a compact task-agnostic BERT for resource-limited devices","author":"Sun","year":"2020"},{"key":"10.1016\/j.knosys.2026.115880_bib0031","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","article-title":"MiniLM: deep self-attention distillation for task-agnostic compression of pre-trained transformers","author":"Wang","year":"2020"},{"issue":"6","key":"10.1016\/j.knosys.2026.115880_bib0032","doi-asserted-by":"crossref","first-page":"1789","DOI":"10.1007\/s11263-021-01453-z","article-title":"Knowledge distillation: a survey","volume":"129","author":"Gou","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.115880_bib0033","series-title":"International Conference on Machine Learning (ICML)","article-title":"Towards understanding knowledge distillation","author":"Phuong","year":"2021"},{"key":"10.1016\/j.knosys.2026.115880_bib0034","series-title":"International Conference on Machine Learning (ICML)","article-title":"Born again neural networks","author":"Furlanello","year":"2018"},{"key":"10.1016\/j.knosys.2026.115880_bib0035","unstructured":"G. Hinton, O. Vinyals, J. Dean, Distilling the knowledge in a neural network, (2015). https:\/\/arxiv.org\/abs\/1503.02531."},{"key":"10.1016\/j.knosys.2026.115880_bib0036","unstructured":"V. Sanh, L. Debut, J. Chaumond, T. Wolf, DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter, (2019). https:\/\/arxiv.org\/abs\/1910.01108."},{"key":"10.1016\/j.knosys.2026.115880_bib0037","series-title":"Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (EACL)","article-title":"Aligning large and small language models via chain-of-Thought reasoning","author":"Ranaldi","year":"2024"},{"key":"10.1016\/j.knosys.2026.115880_bib0038","series-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP)","article-title":"Self-refine instruction-tuning for aligning reasoning in language models","author":"Ranaldi","year":"2024"},{"key":"10.1016\/j.knosys.2026.115880_bib0039","series-title":"International Conference on Learning Representations (ICLR)","article-title":"Spectrally-normalized margin bounds for neural networks","author":"Bartlett","year":"2017"},{"key":"10.1016\/j.knosys.2026.115880_bib0040","series-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)","first-page":"4226","article-title":"CulturaX: a cleaned, enormous, and multilingual dataset for large language models in 167 languages","author":"Nguyen","year":"2024"},{"key":"10.1016\/j.knosys.2026.115880_bib0041","unstructured":"A.Q. Jiang, A. Sablayrolles, A. Roux, A. Mensch, et al., Mixtral of Experts, 2024, 10.48550\/arXiv.2401.04088."},{"key":"10.1016\/j.knosys.2026.115880_bib0042","unstructured":"L. Ben Allal, A. Lozhkov, G. Penedo, T. Wolf, L. von Werra, Cosmopedia, 2024. https:\/\/huggingface.co\/datasets\/HuggingFaceTB\/cosmopedia."},{"key":"10.1016\/j.knosys.2026.115880_bib0043","unstructured":"YTU CE COSMOS Research Group, Cosmos-Turkish-Corpus-v1.0, 2025, https:\/\/huggingface.co\/datasets\/ytu-ce-cosmos\/Cosmos-Turkish-Corpus-v1.0."},{"key":"10.1016\/j.knosys.2026.115880_bib0044","doi-asserted-by":"crossref","unstructured":"H.T. Kesgin, M.K. Yuce, E. Dogan, M.E. Uzun, A. Uz, H.E. Seyrek, A. Zeer, M.F. Amasyal, Introducing cosmosGPT: monolingual training for Turkish language models,(2024) arXiv preprint arXiv: 2404.17336.","DOI":"10.1109\/INISTA62901.2024.10683863"},{"key":"10.1016\/j.knosys.2026.115880_bib0045","unstructured":"Meta, meta-llama\/Llama-3.2-1B, 2024, https:\/\/huggingface.co\/meta-llama\/Llama-3.2-1B."},{"key":"10.1016\/j.knosys.2026.115880_bib0046","unstructured":"Q. Team, Qwen3 Technical Report, 2025. arXiv: 2505.09388."},{"key":"10.1016\/j.knosys.2026.115880_bib0047","unstructured":"W. Lian, B. Goodson, E. Pentland, A. Cook, C. Vong, \u201dTeknium\u201d, OpenOrca: An Open Dataset of GPT Augmented FLAN Reasoning Traces, 2023, https:\/\/https:\/\/huggingface.co\/datasets\/Open-Orca\/OpenOrca."},{"key":"10.1016\/j.knosys.2026.115880_bib0048","unstructured":"Nam Pham, tiny-strange-textbooks (Revision 6f304f1), 2024. https:\/\/huggingface.co\/datasets\/nampdn-ai\/tiny-strange-textbooks. 10.57967\/hf\/1612."}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126006064?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126006064?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T00:15:53Z","timestamp":1781223353000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126006064"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":48,"alternative-id":["S0950705126006064"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115880","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"KFPT: Reliability and uncertainty filtered self-distillation for language model training","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115880","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115880"}}