{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T16:20:40Z","timestamp":1773937240990,"version":"3.50.1"},"reference-count":81,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013076","name":"National Major Science and Technology Projects of China","doi-asserted-by":"publisher","award":["2022ZD0116101"],"award-info":[{"award-number":["2022ZD0116101"]}],"id":[{"id":"10.13039\/501100013076","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.neunet.2025.108444","type":"journal-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T23:19:15Z","timestamp":1765063155000},"page":"108444","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Enhancing end-to-end speech translation via multi-stage knowledge distillation"],"prefix":"10.1016","volume":"197","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4941-2099","authenticated-orcid":false,"given":"Yue","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Yuxuan","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Yanyan","family":"Feng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8163-7139","authenticated-orcid":false,"given":"Xiaodong","family":"Shi","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2025.108444_bib0001","series-title":"Proc. of EMNLP","first-page":"8014","article-title":"Effectively pretraining a speech translation decoder with machine translation data","author":"Alinejad","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0002","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2025.108444_bib0003","series-title":"Asru","article-title":"A comparative study on end-to-end speech to text translation","author":"Bahar","year":"2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0004","unstructured":"Barrault, L., Chung, Y.-A., Meglioli, M. C., Dale, D., Dong, N., Duppenthaler, M., Duquenne, P.-A., Ellis, B., Elsahar, H., Haaheim, J. et al. (2023a). Seamless: Multilingual expressive and streaming speech translation. arXiv preprint arXiv: 2312.05187,."},{"key":"10.1016\/j.neunet.2025.108444_bib0005","unstructured":"Barrault, L., Chung, Y.-A., Meglioli, M. C., Dale, D., Dong, N., Duquenne, P.-A., Elsahar, H., Gong, H., Heffernan, K., Hoffman, J. et al. (2023b). Seamlessm4t: massively multilingual & multimodal machine translation. arXiv preprint arXiv: 2308.11596,."},{"key":"10.1016\/j.neunet.2025.108444_bib0006","series-title":"Interspeech","first-page":"2232","article-title":"Specrec: An alternative solution for improving end-to-end speech-to-text translation via spectrogram reconstruction","author":"Chen","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0007","unstructured":"Chu, Y., Xu, J., Zhou, X., Yang, Q., Zhang, S., Yan, Z., Zhou, C., & Zhou, J. (2023). Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv: 2311.07919,."},{"key":"10.1016\/j.neunet.2025.108444_bib0008","series-title":"2019 Conference of the north american chapter of the association for computational linguistics: Human language technologies","first-page":"2012","article-title":"Must-C: A multilingual speech translation corpus","author":"Di Gangi","year":"2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0009","series-title":"Proc. of AAAI","first-page":"12749","article-title":"Listen, understand and translate: Triple supervision decouples end-to-end speech-to-text translation","volume":"vol. 35","author":"Dong","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0010","unstructured":"Du, Y., Ma, Z., Yang, Y., Deng, K., Chen, X., Yang, B., Xiang, Y., Liu, M., & Qin, B. (2024). Cot-ST: Enhancing LLM-based speech translation with multimodal chain-of-thought. arXiv preprint arXiv: 2409.19510,."},{"key":"10.1016\/j.neunet.2025.108444_bib0011","series-title":"Proc. of AAAI","first-page":"10590","article-title":"Regularizing end-to-end speech translation with triangular decomposition agreement","volume":"vol. 36","author":"Du","year":"2022"},{"key":"10.1016\/j.neunet.2025.108444_bib0012","series-title":"Proceedings of the 60th annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"7050","article-title":"Stemm: Self-learning with speech-text manifold mixup for speech translation","author":"Fang","year":"2022"},{"key":"10.1016\/j.neunet.2025.108444_bib0013","series-title":"Proceedings of the 17th international conference on spoken language translation","first-page":"80","article-title":"End-to-end speech-translation with knowledge distillation: FBK@ IWSLT2020","author":"Gaido","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0014","series-title":"Proceedings of the 23rd international conference on machine learning","first-page":"369","article-title":"Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks","author":"Graves","year":"2006"},{"key":"10.1016\/j.neunet.2025.108444_bib0015","series-title":"Findings of the association for computational linguistics: ACL-IJCNLP 2021","first-page":"2214","article-title":"Learning shared semantic space for speech-to-text translation","author":"Han","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0016","unstructured":"Hinton, G., Vinyals, O., & Dean, J. (2015). Distilling the knowledge in a neural network [J], arXiv preprint arXiv: 1503.02531, 2015."},{"key":"10.1016\/j.neunet.2025.108444_bib0017","doi-asserted-by":"crossref","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","article-title":"HuBERT: Self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu","year":"2021","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.108444_bib0018","unstructured":"Hu, E. J., Wallis, P., Allen-Zhu, Z., et al. LoRA: Low-Rank Adaptation of Large Language Models[C]In International conference on learning representations."},{"key":"10.1016\/j.neunet.2025.108444_bib0019","series-title":"Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"74","article-title":"Gentranslate: Large language models are generative multilingual speech and machine translators","author":"Hu","year":"2024"},{"key":"10.1016\/j.neunet.2025.108444_bib0020","series-title":"Proceedings of the 2021 conference of the north american chapter of the association for computational linguistics: Human language technologies","first-page":"1872","article-title":"Source and target bidirectional knowledge distillation for end-to-end speech translation","author":"Inaguma","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0021","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics: System demonstrations","first-page":"302","article-title":"ESPNET-ST: All-in-one speech translation toolkit","author":"Inaguma","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0022","series-title":"Proceedings of the 18th international conference on spoken language translation (IWSLT 2021)","first-page":"100","article-title":"ESPNET-ST IWSLT 2021 offline speech translation system","author":"Inaguma","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0023","series-title":"Proc. of ICASSP","article-title":"Data efficient direct speech-to-text translation with modality agnostic meta-learning","author":"Indurthi","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0024","series-title":"Proc. of ICASSP","first-page":"7904","article-title":"End-end speech-to-text translation with modality agnostic meta-learning","author":"Indurthi","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0025","series-title":"ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (icassp)","first-page":"7723","article-title":"Task aware multi-task learning for speech to text tasks","author":"Indurthi","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0026","series-title":"ICASSP 2019-2019 IEEE international conference on acoustics, speech and signal processing (icassp)","first-page":"7180","article-title":"Leveraging weakly supervised data to improve end-to-end speech-to-text translation","author":"Jia","year":"2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0027","series-title":"Findings of the association for computational linguistics: EMNLP 2020","first-page":"4163","article-title":"TinyBERT: Distilling BERT for natural language understanding","author":"Jiao","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0028","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2024.108001","article-title":"Knowledge distillation with insufficient training data for regression","volume":"132","author":"Kang","year":"2024","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"10.1016\/j.neunet.2025.108444_bib0029","unstructured":"Khandelwal, U., Fan, A., Jurafsky, D., Zettlemoyer, L., & Lewis, M. (2020). Nearest neighbor machine translation. https:\/\/arxiv.org\/abs\/2010.00710."},{"key":"10.1016\/j.neunet.2025.108444_bib0030","series-title":"Proc. of EMNLP","first-page":"1317","article-title":"Sequence-level knowledge distillation","author":"Kim","year":"2016"},{"key":"10.1016\/j.neunet.2025.108444_bib0031","series-title":"ICLR (Poster)","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2015"},{"key":"10.1016\/j.neunet.2025.108444_bib0032","series-title":"Interspeech","first-page":"2272","article-title":"Asr posterior-based loss for multi-task end-to-end speech translation","author":"Ko","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0033","series-title":"ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (icassp)","first-page":"7508","article-title":"Cascaded models with cyclic feedback for direct speech translation","author":"Lam","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0034","series-title":"Proceedings of the 28th international conference on computational linguistics","first-page":"3520","article-title":"Dual-decoder transformer for joint automatic speech recognition and multilingual speech translation","author":"Le","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0035","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 2: Short papers)","first-page":"817","article-title":"Lightweight adapter tuning for multilingual speech translation","author":"Le","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0036","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: Long papers)","first-page":"827","article-title":"Multilingual speech translation from efficient finetuning of pretrained models","author":"Li","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0037","doi-asserted-by":"crossref","first-page":"1128","DOI":"10.21437\/Interspeech.2019-2582","article-title":"End-to-end speech translation with knowledge distillation","author":"Liu","year":"2019","journal-title":"Proc. Interspeech 2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0038","doi-asserted-by":"crossref","first-page":"194","DOI":"10.1016\/j.neunet.2022.01.016","article-title":"Improving data augmentation for low resource speech-to-text translation with diverse paraphrasing","volume":"148","author":"Mi","year":"2022","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2025.108444_bib0039","series-title":"Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"3891","article-title":"Waco: Word-aligned contrastive learning for speech translation","author":"Ouyang","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0040","series-title":"2015\u202fIEEE International conference on acoustics, speech and signal processing (ICASSP)","first-page":"5206","article-title":"Librispeech: An ASR corpus based on public domain audio books","author":"Panayotov","year":"2015"},{"key":"10.1016\/j.neunet.2025.108444_bib0041","series-title":"Proceedings of the 2021 conference on empirical methods in natural language processing","first-page":"1698","article-title":"Speechformer: Reducing information loss in direct speech translation","author":"Papi","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0042","series-title":"Proc. Interspeech 2020","first-page":"1476","article-title":"Self-training for end-to-end speech translation","author":"Pino","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0043","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"3787","article-title":"Simulspeech: End-to-end simultaneous speech to text translation","author":"Ren","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0044","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: Human language technologies, volume 1 (long and short papers)","first-page":"2786","article-title":"Fluent translations from disfluent speech in end-to-end speech translation","author":"Salesky","year":"2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2022.105560","article-title":"An adaptive teacher\u2013student learning algorithm with decomposed knowledge distillation for on-edge intelligence","volume":"117","author":"Sepahvand","year":"2023","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"10.1016\/j.neunet.2025.108444_bib0046","series-title":"Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP)","first-page":"4323","article-title":"Patient knowledge distillation for BERT model compression","author":"Sun","year":"2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0047","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"2158","article-title":"MobileBERT: A compact task-agnostic BERT for resource-limited devices","author":"Sun","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0048","series-title":"The twelfth international conference on learning representations","article-title":"Salmonn: Towards generic hearing abilities for large language models","author":"Tang","year":"2024"},{"key":"10.1016\/j.neunet.2025.108444_bib0049","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: Long papers)","first-page":"4252","article-title":"Improving speech translation by understanding and learning from the auxiliary text translation task","author":"Tang","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0050","series-title":"ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"6209","article-title":"A general multi-task learning framework to leverage text data for speech to text tasks","author":"Tang","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0051","series-title":"Proc. neurIPS","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.neunet.2025.108444_bib0052","series-title":"IberSPEECH","first-page":"60","article-title":"End-to-end speech translation with the transformer","author":"Vila","year":"2018"},{"key":"10.1016\/j.neunet.2025.108444_bib0053","series-title":"ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (icassp)","first-page":"7513","article-title":"Jointly trained transformers models for spoken language translation","author":"Vydana","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0054","unstructured":"Wang, C., Liao, M., Huang, Z., Lu, J., Wu, J., Liu, Y., Zong, C., & Zhang, J. (2023). BLSP: Bootstrapping language-speech pre-training via behavior alignment of continuation writing. arXiv preprint arXiv: 2309.00916,."},{"key":"10.1016\/j.neunet.2025.108444_bib0055","unstructured":"Wang, C., Liao, M., Huang, Z., & Zhang, J. (2024). BLSP-KD: Bootstrapping language-speech pre-training via knowledge distillation. arXiv preprint arXiv: 2405.19041,."},{"key":"10.1016\/j.neunet.2025.108444_bib0056","series-title":"Proceedings of the 1st conference of the asia-pacific chapter of the association for computational linguistics and the 10th international joint conference on natural language processing: System demonstrations","first-page":"33","article-title":"Fairseq s2t: Fast speech-to-text modeling with fairseq","author":"Wang","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0057","series-title":"Interspeech","first-page":"2247","article-title":"CovoST 2 and massively multilingual speech translation","volume":"vol. 2021","author":"Wang","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0058","series-title":"Proc. of AAAI","first-page":"9161","article-title":"Bridging the gap between pre-training and fine-tuning for end-to-end speech translation","volume":"vol. 34","author":"Wang","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0059","series-title":"Proc. of ACL","first-page":"3728","article-title":"Curriculum pre-training for end-to-end speech translation","author":"Wang","year":"2020"},{"key":"10.1016\/j.neunet.2025.108444_bib0060","series-title":"Interspeech","article-title":"Sequence-to-sequence models can directly translate foreign speech","author":"Weiss","year":"2017"},{"key":"10.1016\/j.neunet.2025.108444_bib0061","series-title":"Proc. ACL","first-page":"2619","article-title":"Stacked acoustic-and-textual encoding: Integrating the pre-trained models into speech translation encoders","author":"Xu","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0062","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: Long papers)","first-page":"2619","article-title":"Stacked acoustic-and-textual encoding: Integrating the pre-trained models into speech translation encoders","author":"Xu","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0063","series-title":"Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"13321","article-title":"CTC-based non-autoregressive speech translation[C]","author":"Xu","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0064","unstructured":"Xu, J., Guo, Z., He, J., Hu, H., He, T., Bai, S., Chen, K., Wang, J., Fan, Y., Dang, K. et al. (2025). Qwen2. 5-omni technical report. arXiv preprint arXiv: 2503.20215,."},{"key":"10.1016\/j.neunet.2025.108444_bib0065","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 2: Short papers)","first-page":"511","article-title":"Bilingual mutual information based adaptive training for neural machine translation","author":"Xu","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0066","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123317","article-title":"Speaker voice normalization for end-to-end speech translation","volume":"248","author":"Xue","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2025.108444_bib0067","doi-asserted-by":"crossref","unstructured":"Ye, R., Wang, M., & Li, L. (2021). End-to-end speech translation via cross-modal progressive training. arXiv preprint arXiv: 2104.10380,.","DOI":"10.21437\/Interspeech.2021-1065"},{"key":"10.1016\/j.neunet.2025.108444_bib0068","series-title":"Proceedings of the 2022 conference of the north american chapter of the association for computational linguistics: Human language technologies","first-page":"5099","article-title":"Cross-modal contrastive learning for speech translation","author":"Ye","year":"2022"},{"key":"10.1016\/j.neunet.2025.108444_bib0069","series-title":"ICASSP 2024-2024 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"12727","article-title":"Memory-augmented speech-to-text translation with multi-scale context translation strategy","author":"Yuan","year":"2024"},{"key":"10.1016\/j.neunet.2025.108444_bib0070","series-title":"Findings of the association for computational linguistics: EMNLP 2023","first-page":"2353","article-title":"Adatrans: Adapting with boundary-based shrinking for end-to-end speech translation","author":"Zeng","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0071","series-title":"Findings of the association for computational linguistics: ACL 2023","first-page":"7147","article-title":"Dub: Discrete unit back-translation for speech translation","author":"Zhang","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0072","series-title":"ICASSP 2023-2023 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"1","article-title":"Decoupled non-parametric knowledge distillation for end-to-end speech translation","author":"Zhang","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0073","series-title":"Proc. of AC","first-page":"6475","article-title":"Lattice transformer for speech translation","author":"Zhang","year":"2019"},{"key":"10.1016\/j.neunet.2025.108444_bib0074","series-title":"Proceedings of the 60th annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"2377","article-title":"Conditional bilingual mutual information based adaptive training for neural machine translation","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neunet.2025.108444_bib0075","series-title":"Proceedings of the 19th international conference on spoken language translation (IWSLT 2022)","first-page":"198","article-title":"The USTC-NELSLIP offline speech translation systems for IWSLT 2022[C]","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neunet.2025.108444_bib0076","series-title":"Proc. of EMNLP","article-title":"Mutual-learning improves end-to-end speech translation","author":"Zhao","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0077","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123241","article-title":"Regularizing cross-attention learning for end-to-end speech translation with ASR and MT attention matrices","volume":"247","author":"Zhao","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2025.108444_bib0078","series-title":"Findings of the association for computational linguistics: EMNLP 2023","first-page":"5920","article-title":"CCSRD: Content-centric speech representation disentanglement learning for end-to-end speech translation","author":"Zhao","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0079","series-title":"International conference on machine learning","first-page":"12736","article-title":"Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation","author":"Zheng","year":"2021"},{"key":"10.1016\/j.neunet.2025.108444_bib0080","series-title":"Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"7873","article-title":"CMOT: Cross-modal mixup via optimal transport for speech translation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.neunet.2025.108444_bib0081","doi-asserted-by":"crossref","first-page":"8641","DOI":"10.1007\/s00521-024-09547-8","article-title":"A multitask co-training framework for improving speech translation by leveraging speech recognition and machine translation tasks[J]","author":"Zhou","year":"2024","journal-title":"Neural Computing and Applications"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608025013255?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608025013255?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T14:38:46Z","timestamp":1773931126000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608025013255"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":81,"alternative-id":["S0893608025013255"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2025.108444","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Enhancing end-to-end speech translation via multi-stage knowledge distillation","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2025.108444","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108444"}}