{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T01:10:04Z","timestamp":1763341804587,"version":"3.45.0"},"reference-count":46,"publisher":"Tech Science Press","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.061145","type":"journal-article","created":{"date-parts":[[2025,2,17]],"date-time":"2025-02-17T02:08:21Z","timestamp":1739758101000},"page":"2305-2322","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Neural Machine Translation Based on Knowledge Distillation and Anti-Noise Interaction"],"prefix":"10.32604","volume":"83","author":[{"given":"Erlin","family":"Tian","sequence":"first","affiliation":[]},{"given":"Zengchao","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Fangmei","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zuhe","family":"Li","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"121168","DOI":"10.1016\/j.eswa.2023.121168","article-title":"A survey on multimodal bidirectional machine learning translation of image and natural language processing","volume":"235","author":"Nam","year":"2024","journal-title":"Expert Syst Appl"},{"key":"ref2","series-title":"MM '24: Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"4227","article-title":"Virtual visual-guided domain-shadow fusion via modal exchanging for domain-specific multi-modal neural machine translation","author":"Hou","year":"2024"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"352","DOI":"10.1016\/j.inffus.2022.10.018","article-title":"Multimodality information fusion for automated machine translation","volume":"91","author":"Li","year":"2023","journal-title":"Inf Fusion"},{"key":"ref4","doi-asserted-by":"crossref","unstructured":"Yin Y, Meng F, Su J, Zhou C, Yang Z, Zhou J, et al. A novel graph-based multi-modal fusion encoder for neural machine translation [M.S. dissertation]. China: Xiamen University; 2020.","DOI":"10.18653\/v1\/2020.acl-main.273"},{"key":"ref5","series-title":"Proceedings of the 29th International Conference on Computational Linguistics","first-page":"5098","article-title":"Noise-robust cross-modal interactive learning with text2image mask for multi-modal neural machine translation","author":"Ye","year":"2022"},{"key":"ref6","series-title":"Proceedings of the 28th International Conference on Computational Linguistics","first-page":"4304","article-title":"Supervised visual attention for multimodal neural machine translation","author":"Nishihara","year":"2020"},{"key":"ref7","doi-asserted-by":"crossref","unstructured":"Li J, Ataman D, Sennrich R. Vision matters when it should: sanity checking multimodal machine translation models [M.S. dissertation]. Switzerland: ETH Z\u00fcrich; 2021.","DOI":"10.18653\/v1\/2021.emnlp-main.673"},{"key":"ref8","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2024","first-page":"5596","article-title":"Visual pivoting unsupervised multimodal machine translation in low-resource distant language Pairs","author":"Tayir","year":"2024"},{"key":"ref9","series-title":"ICASSP 2021\u20142021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"7533","article-title":"Modeling homophone noise for robust neural machine translation","author":"Qin","year":"2021"},{"key":"ref10","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"4346","article-title":"Multimodal transformer for multimodal machine translation","author":"Yao","year":"2020"},{"key":"ref11","unstructured":"Wang F, Yan J, Meng F, Zhou J. Selective knowledge distillation for neural machine translation [M.S. dissertation]. China: Peking University; 2021."},{"key":"ref12","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1162\/tacl_a_00299","article-title":"Membership inference attacks on sequence-to-sequence models: is my data in your machine translation system?","volume":"8","author":"Hisamoto","year":"2020","journal-title":"Trans Assoc Computat Linguist"},{"key":"ref13","unstructured":"Gain B, Bandyopadhyay D, Ekbal A. Experiences of adapting multimodal machine translation techniques for hindi [M.S. dissertation]. India: Indian Institute of Technology Patna; 2021."},{"key":"ref14","first-page":"639","article-title":"Attention-based multimodal neural machine translation","volume":"2","author":"Huang","year":"2016","journal-title":"First Conf Mach Transl"},{"key":"ref15","doi-asserted-by":"crossref","unstructured":"Calixto I, Liu Q, Campbell N. Incorporating global visual features into attention-based neural machine translation [M.S. dissertation]. Irish: ADAPT Centre Dublin City University; 2017.","DOI":"10.18653\/v1\/D17-1105"},{"key":"ref16","doi-asserted-by":"crossref","unstructured":"Elliott D, Frank S, Barrault L, Bougares F, Specia L. Findings of the second shared task on multimodal machine translation and multilingual image description [M.S. dissertation]. England: School of Informatics University of Edinburgh; 2017.","DOI":"10.18653\/v1\/W17-4718"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"244","DOI":"10.1109\/TASLP.2021.3138719","article-title":"Word-region alignment-guided multimodal neural machine translation","volume":"30","author":"Zhao","year":"2021","journal-title":"IEEE\/ACM Trans Audio, Speech, Lang Process"},{"key":"ref18","doi-asserted-by":"crossref","first-page":"3013","DOI":"10.1109\/TMM.2021.3092187","article-title":"Enhancing neural machine translation with dual-side multimodal awareness","volume":"24","author":"Song","year":"2021","journal-title":"IEEE Trans Multimed"},{"key":"ref19","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"1320","article-title":"Dynamic context-guided capsule network for multimodal machine translation","author":"Lin","year":"2020"},{"key":"ref20","series-title":"Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining","first-page":"535","article-title":"Model compression","author":"Caruana","year":"2006"},{"key":"ref21","unstructured":"Hinton G. Distilling the knowledge in a neural network [Ph.D. dissertation]. USA: Google Inc.; 2015."},{"key":"ref22","unstructured":"Romero A, Ballas N, Kahou SE, Chassang A, Gatta C, Bengio Y. FitNets: hints for thin deep nets [M.S. dissertation]. Spanish: Universitat de Barcelona; 2014."},{"key":"ref23","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3084","article-title":"A gift from knowledge distillation: fast optimization, network minimization and transfer learning","author":"Yim","year":"2024"},{"key":"ref24","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"2827","article-title":"Cross modal distillation for supervision transfer","author":"Gupta","year":"2016"},{"key":"ref25","series-title":"MM '18: Proceedings of the 26th ACM international conference on Multimedia","first-page":"1407","article-title":"Text-to-image synthesis via symmetrical distillation networks","author":"Yuan","year":"2018"},{"key":"ref26","series-title":"Proceeding of Machine Learning Research","first-page":"1","article-title":"Mass: masked sequence to sequence pre-training for language generation","author":"Song","year":"2019"},{"key":"ref27","doi-asserted-by":"crossref","unstructured":"Li P, Li L, Zhang M, Wu M, Liu Q. Universal conditional masked language pre-training for neural machine translation [Ph.D. dissertation]. China: Huawei Noah\u2019s Ark Lab; 2022.","DOI":"10.18653\/v1\/2022.acl-long.442"},{"key":"ref28","doi-asserted-by":"crossref","unstructured":"Huang H, Liang Y, Duan N, Gong M, Shou L, Jiang D, et al. Unicoder: a universal language encoder by pre-training with multiple cross-lingual tasks [Ph.D. dissertation]. China: Microsoft Research Asia; 2019.","DOI":"10.18653\/v1\/D19-1252"},{"key":"ref29","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J Mach Learn Res"},{"key":"ref30","series-title":"Proceedings of the 37th International Conference on Machine Learning","first-page":"642","article-title":"UniLMv2: pseudo-masked language models for unified language model pre-training","author":"Bao","year":"2020"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"10839","DOI":"10.1007\/s11042-023-15724-z","article-title":"Enhanced copy-move forgery detection using deep convolutional neural network (DCNN) employing the ResNet-101 transfer learning model","volume":"83","author":"Vaishali","year":"2024","journal-title":"Multimed Tools Appl"},{"key":"ref32","first-page":"13165","article-title":"MST: masked self-supervised transformer for visual representation","volume":"34","author":"Li","year":"2021","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref33","doi-asserted-by":"crossref","unstructured":"Elliott D, Frank S, Sima\u2019an K, Specia L. Multi30k: multilingual english-german image descriptions [Ph.D. dissertation]. Holland: University of Amsterdam; 2016.","DOI":"10.18653\/v1\/W16-3210"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions","volume":"2","author":"Young","year":"2014","journal-title":"Trans Assoc Computat Linguist"},{"key":"ref35","doi-asserted-by":"crossref","unstructured":"Mathur N, Baldwin T, Cohn T. Tangled up in BLEU: reevaluating the evaluation of automatic machine translation evaluation metrics [M.S. dissertation]. Australia: The University of Melbourne; 2020.","DOI":"10.18653\/v1\/2020.acl-main.448"},{"key":"ref36","doi-asserted-by":"crossref","first-page":"040036","DOI":"10.1063\/5.0067018","article-title":"Automatic evaluating of Russian-Arabic machine translation quality using METEOR method","volume":"2386","author":"Hameed","year":"2022","journal-title":"AIP Conf Proc"},{"key":"ref37","unstructured":"Calixto I, Rios M, Aziz W. Latent variable model for multi-modal translation [M.S. dissertation]. Holland: The University of Amsterdam; 2018. LIUM-CVC Submissions for WMT17 Multimodal Translation Task [M.S. dissertation]. French: University of Le Mans; 2017."},{"key":"ref38","doi-asserted-by":"crossref","unstructured":"Zhou M, Cheng R, Lee YJ, Yu Z. A visual attention grounding neural model for multimodal machine translation [M.S. dissertation]. Canada: University of California; 2018.","DOI":"10.18653\/v1\/D18-1400"},{"key":"ref39","doi-asserted-by":"crossref","unstructured":"Caglayan O, Madhyastha P, Specia L, Barrault L. Probing the need for visual context in multimodal machine translation [M.S. dissertation]. French: Le Mans University; 2019.","DOI":"10.18653\/v1\/N19-1422"},{"key":"ref40","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2720","article-title":"Efficient object-level visual context modeling for multimodal machine translation: masking irrelevant objects helps grounding","author":"Wang","year":"2021"},{"key":"ref41","unstructured":"Vaswani A. Attention is all you need [Ph.D. dissertation]. USA: Google Research; 2017."},{"key":"ref42","doi-asserted-by":"crossref","unstructured":"Wu Z, Kong L, Bi W, Li X, Kao B. Good for misconceived reasons: an empirical revisiting on the need for visual context in multimodal machine translation [M.S. dissertation]. China: The University of Hong Kong; 2021.","DOI":"10.18653\/v1\/2021.acl-long.480"},{"key":"ref43","unstructured":"Gong H, Jia M, Jing L. Multimodal interaction modeling via self-supervised multi-task learning for review helpfulness prediction [M.S. dissertation]. China: Sun Yat-sen University; 2024."},{"key":"ref44","series-title":"Proceedings of ICLR","article-title":"The progressive alignment-aware multimodal fusion with easy2hard strategy for multimodal neural machine translation","author":"Ye","year":"2023"},{"key":"ref45","doi-asserted-by":"crossref","unstructured":"Ji B, Zhang T, Zou Y, Hu B, Shen S. Increasing visual awareness in multimodal neural machine translation from an information theoretic perspective [M.S. dissertation]. China: Tencent Minority-Mandarin Translation; 2022.","DOI":"10.18653\/v1\/2022.emnlp-main.453"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"14194","DOI":"10.1007\/s10489-022-03331-8","article-title":"Dual-level interactive multimodal-mixup encoder for multi-modal neural machine translation","volume":"52","author":"Ye","year":"2022","journal-title":"Appl Intell"}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-83-2\/TSP_CMC_61145\/TSP_CMC_61145.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T01:05:54Z","timestamp":1763341554000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v83n2\/60545"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":46,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.061145","relation":{},"ISSN":["1546-2226"],"issn-type":[{"type":"electronic","value":"1546-2226"}],"subject":[],"published":{"date-parts":[[2025]]}}}