{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T10:21:58Z","timestamp":1774002118090,"version":"3.50.1"},"reference-count":67,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013050","name":"Guangdong Provincial Applied Science and Technology Research and Development Program","doi-asserted-by":"publisher","award":["2024B0101050003"],"award-info":[{"award-number":["2024B0101050003"]}],"id":[{"id":"10.13039\/501100013050","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2024A1515011491"],"award-info":[{"award-number":["2024A1515011491"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017610","name":"Shenzhen Science and Technology Innovation Program","doi-asserted-by":"publisher","award":["ZDSYS20230626091203008"],"award-info":[{"award-number":["ZDSYS20230626091203008"]}],"id":[{"id":"10.13039\/501100017610","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017610","name":"Shenzhen Science and Technology Innovation Program","doi-asserted-by":"publisher","award":["KJZD20231023094700001"],"award-info":[{"award-number":["KJZD20231023094700001"]}],"id":[{"id":"10.13039\/501100017610","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017610","name":"Shenzhen Science and Technology Innovation Program","doi-asserted-by":"publisher","award":["KQTD20240729102154066"],"award-info":[{"award-number":["KQTD20240729102154066"]}],"id":[{"id":"10.13039\/501100017610","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based 
Systems"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.knosys.2026.115382","type":"journal-article","created":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T07:45:01Z","timestamp":1769499901000},"page":"115382","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Exploring and enhancing the transfer of distribution in knowledge distillation for autoregressive language models"],"prefix":"10.1016","volume":"337","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5804-1508","authenticated-orcid":false,"given":"Jun","family":"Rao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8524-2006","authenticated-orcid":false,"given":"Xuebo","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3355-9648","authenticated-orcid":false,"given":"Zepeng","family":"Lin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8976-2084","authenticated-orcid":false,"given":"Liang","family":"Ding","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3262-3734","authenticated-orcid":false,"given":"Jing","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-5510","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.115382_bib0001","series-title":"NeurIPS","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"10.1016\/j.knosys.2026.115382_bib0002","unstructured":"OpenAI, GPT-4 technical report, in: ArXiv, 2023. https:\/\/arxiv.org\/abs\/2303.08774."},{"key":"10.1016\/j.knosys.2026.115382_bib0003","series-title":"ArXiv","article-title":"LLaMa: open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0004","series-title":"NeurIPS Workshop","article-title":"Distilling the knowledge in a neural network","author":"Hinton","year":"2015"},{"key":"10.1016\/j.knosys.2026.115382_bib0005","series-title":"EMNLP","article-title":"Patient knowledge distillation for BERT model compression","author":"Sun","year":"2019"},{"key":"10.1016\/j.knosys.2026.115382_bib0006","series-title":"CIKM","article-title":"Student can also be a good teacher: extracting knowledge from vision-and-language model for cross-modal retrieval","author":"Rao","year":"2021"},{"key":"10.1016\/j.knosys.2026.115382_bib0007","series-title":"CVPR","article-title":"A gift from knowledge distillation: fast optimization, network minimization and transfer learning","author":"Yim","year":"2017"},{"key":"10.1016\/j.knosys.2026.115382_bib0008","series-title":"CVPR","article-title":"Online knowledge distillation via collaborative learning","author":"Guo","year":"2020"},{"issue":"5","key":"10.1016\/j.knosys.2026.115382_bib0009","doi-asserted-by":"crossref","first-page":"3465","DOI":"10.1109\/TCSVT.2023.3325814","article-title":"Improving knowledge distillation via head and tail categories","volume":"34","author":"Xu","year":"2023","journal-title":"IEEE Trans. Circuits Syst. 
Video Technol."},{"issue":"6","key":"10.1016\/j.knosys.2026.115382_bib0010","doi-asserted-by":"crossref","first-page":"4188","DOI":"10.1109\/TPAMI.2024.3354928","article-title":"Learning from human educational wisdom: a student-centered knowledge distillation method","volume":"46","author":"Yang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115382_bib0011","series-title":"ICLR","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2015"},{"key":"10.1016\/j.knosys.2026.115382_bib0012","series-title":"NeurIPS","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.knosys.2026.115382_bib0013","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","volume":"28","author":"Bengio","year":"2015","journal-title":"NeurIPS"},{"key":"10.1016\/j.knosys.2026.115382_bib0014","series-title":"Findings of the ACL","first-page":"700","article-title":"Why exposure bias matters: an imitation learning perspective of error accumulation in language generation","author":"Arora","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0015","article-title":"GKD: generalized knowledge distillation for auto-regressive sequence models","author":"Agarwal","year":"2024","journal-title":"ICLR"},{"key":"10.1016\/j.knosys.2026.115382_bib0016","series-title":"ICLR","article-title":"MiniLLM: knowledge distillation of large language models","author":"Gu","year":"2024"},{"key":"10.1016\/j.knosys.2026.115382_bib0017","series-title":"ACL","first-page":"10817","article-title":"f-divergence minimization for sequence-level knowledge distillation","author":"Wen","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0018","article-title":"Learning student-friendly teacher networks for knowledge distillation","author":"Park","year":"2021","journal-title":"NeurIPS"},{"key":"10.1016\/j.knosys.2026.115382_bib0019","series-title":"ICCV","article-title":"Student customized knowledge distillation: bridging the gap between student and teacher","author":"Zhu","year":"2021"},{"key":"10.1016\/j.knosys.2026.115382_bib0020","series-title":"ICCV","article-title":"Knowledge distillation via route constrained optimization","author":"Jin","year":"2019"},{"key":"10.1016\/j.knosys.2026.115382_bib0021","series-title":"AAAI","article-title":"Improved knowledge distillation via teacher assistant","author":"Mirzadeh","year":"2020"},{"key":"10.1016\/j.knosys.2026.115382_bib0022","series-title":"AISTATS","first-page":"661","article-title":"Efficient reductions for imitation learning","volume":"9","author":"Ross","year":"2010"},{"key":"10.1016\/j.knosys.2026.115382_bib0023","series-title":"EMNLP","first-page":"6121","article-title":"Autoregressive knowledge distillation through imitation learning","author":"Lin","year":"2020"},{"key":"10.1016\/j.knosys.2026.115382_bib0024","unstructured":"R. Taori, I. Gulrajani, T. Zhang, Y. Dubois, X. Li, C. Guestrin, P. Liang, T.B. 
Hashimoto, Stanford alpaca: an instruction-following LlaMA model, 2023, (https:\/\/github.com\/tatsu-lab\/stanford_alpaca)."},{"key":"10.1016\/j.knosys.2026.115382_bib0025","series-title":"ArXiv","article-title":"Instruction tuning with GPT-4","author":"Peng","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0026","series-title":"ArXiv","article-title":"Baize: an open-source chat model with parameter-efficient tuning on self-chat data","author":"Xu","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0027","series-title":"ArXiv","article-title":"Phoenix: democratizing chatGPT across languages","author":"Chen","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0028","series-title":"NeurIPS","article-title":"How far can camels go? Exploring the state of instruction tuning on open resources","author":"Wang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0029","series-title":"ICLR","article-title":"Finetuned language models are zero-shot learners","author":"Wei","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0030","series-title":"EMNLP","article-title":"Super-naturalinstructions: generalization via declarative instructions on 1600+ NLP tasks","author":"Wang","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0031","series-title":"NeurIPS","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0032","series-title":"ArXiv","article-title":"Scaling instruction-finetuned language models","author":"Chung","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0033","series-title":"ArXiv","article-title":"Flan-MoE: scaling instruction-finetuned language models with sparse mixture of experts","author":"Shen","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0034","series-title":"ACL (Findings)","article-title":"One embedder, any task: instruction-finetuned text embeddings","author":"Su","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0035","series-title":"ACL","article-title":"BERT learns to teach: knowledge distillation with meta learning","author":"Zhou","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0036","series-title":"ICLR","article-title":"Understanding and improving lexical choice in non-autoregressive translation","author":"Ding","year":"2021"},{"key":"10.1016\/j.knosys.2026.115382_bib0037","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112573","article-title":"Learning continuation: integrating past knowledge for contrastive distillation","volume":"304","author":"Zhang","year":"2024","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115382_bib0038","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.112955","article-title":"Multi-level knowledge distillation for fine-grained fashion image retrieval","volume":"310","author":"Xiao","year":"2025","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115382_bib0039","first-page":"1","article-title":"Dynamic contrastive distillation for image-text retrieval","author":"Rao","year":"2023","journal-title":"IEEE Trans. 
Multimed."},{"key":"10.1016\/j.knosys.2026.115382_bib0040","series-title":"ICLR","article-title":"Contrastive representation distillation","author":"Tian","year":"2020"},{"key":"10.1016\/j.knosys.2026.115382_bib0041","series-title":"CVPR","article-title":"Relational knowledge distillation","author":"Park","year":"2019"},{"key":"10.1016\/j.knosys.2026.115382_bib0042","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112531","article-title":"Adaptive class token knowledge distillation for efficient vision transformer","volume":"304","author":"Kang","year":"2024","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115382_bib0043","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112503","article-title":"StAlK: structural alignment based self knowledge distillation for medical image classification","volume":"304","author":"Sharma","year":"2024","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115382_bib0044","series-title":"CVPR","article-title":"Deep mutual learning","author":"Zhang","year":"2018"},{"key":"10.1016\/j.knosys.2026.115382_bib0045","first-page":"1","article-title":"Parameter-efficient and student-friendly knowledge distillation","author":"Rao","year":"2023","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.knosys.2026.115382_bib0046","series-title":"ACL","first-page":"3431","article-title":"Rejuvenating low-frequency words: making the most of parallel data in non-autoregressive translation","author":"Ding","year":"2021"},{"key":"10.1016\/j.knosys.2026.115382_bib0047","series-title":"ACL","first-page":"2417","article-title":"Redistributing low-frequency words: making the most of monolingual data in non-autoregressive translation","author":"Ding","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0048","series-title":"Coling","article-title":"Rethinking Kullback-Leibler divergence in knowledge distillation for large language models","author":"Wu","year":"2024"},{"key":"10.1016\/j.knosys.2026.115382_bib0049","series-title":"EMNLP-Findings","article-title":"PromptKD: distilling student-friendly knowledge for generative language models via prompt tuning","author":"Kim","year":"2024"},{"key":"10.1016\/j.knosys.2026.115382_bib0050","series-title":"ICML","article-title":"DistiLLM: towards streamlined distillation for large language models","author":"Ko","year":"2024"},{"key":"10.1016\/j.knosys.2026.115382_bib0051","series-title":"ICLR","article-title":"Tailoring language generation models under total variation distance","author":"Ji","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0052","series-title":"ACL","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.knosys.2026.115382_bib0053","series-title":"ACL","first-page":"8062","article-title":"Towards understanding and improving knowledge distillation for neural machine translation","author":"Zhang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0054","series-title":"ACL","article-title":"Revisiting knowledge distillation for autoregressive language models","author":"Zhong","year":"2024"},{"issue":"8","key":"10.1016\/j.knosys.2026.115382_bib0055","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"10.1016\/j.knosys.2026.115382_bib0056","series-title":"ArXiv","article-title":"TinyLlama: an open-source small language 
model","author":"Zhang","year":"2024"},{"key":"10.1016\/j.knosys.2026.115382_bib0057","series-title":"SIGIR","article-title":"Where does the performance improvement come from - a reproducibility concern about image-text retrieval","author":"Rao","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0058","unstructured":"M. Conover, M. Hayes, A. Mathur, J. Xie, J. Wan, S. Shah, A. Ghodsi, P. Wendell, M. Zaharia, R. Xin, Free dolly: introducing the world\u2019s first truly open instruction-tuned LLM, 2023, https:\/\/www.databricks.com\/blog\/2023\/04\/12\/dolly-first-open-commercially-viable-instruction-tuned-llm."},{"key":"10.1016\/j.knosys.2026.115382_bib0059","series-title":"ArXiv","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","author":"Zheng","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0060","series-title":"ACL","first-page":"13484","article-title":"Self-instruct: aligning language models with self-generated instructions","author":"Wang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115382_bib0061","series-title":"KDD","article-title":"DeepSpeed: system optimizations enable training deep learning models with over 100 billion parameters","author":"Rasley","year":"2020"},{"key":"10.1016\/j.knosys.2026.115382_bib0062","series-title":"SC","first-page":"20","article-title":"ZeRO: memory optimizations toward training trillion parameter models","author":"Rajbhandari","year":"2020"},{"key":"10.1016\/j.knosys.2026.115382_bib0063","series-title":"ICLR","article-title":"LoRA: low-rank adaptation of large language models","author":"Hu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0064","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.111915","article-title":"Student-friendly knowledge distillation","volume":"296","author":"Yuan","year":"2024","journal-title":"Knowl. 
Based Syst."},{"key":"10.1016\/j.knosys.2026.115382_bib0065","series-title":"ACL","article-title":"P-tuning: prompt tuning can be comparable to fine-tuning across scales and tasks","author":"Liu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115382_bib0066","series-title":"ACL","article-title":"Prefix-tuning: optimizing continuous prompts for generation","author":"Li","year":"2021"},{"key":"10.1016\/j.knosys.2026.115382_bib0067","series-title":"ICLR","article-title":"Towards a unified view of parameter-efficient transfer learning","author":"He","year":"2022"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126001255?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126001255?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T07:43:01Z","timestamp":1773992581000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126001255"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":67,"alternative-id":["S0950705126001255"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115382","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Exploring and enhancing the transfer of distribution in knowledge distillation for autoregressive language models","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115382","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115382"}}