{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:22:04Z","timestamp":1765387324257,"version":"3.46.0"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"37","license":[{"start":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T00:00:00Z","timestamp":1750464000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T00:00:00Z","timestamp":1750464000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-025-20985-x","type":"journal-article","created":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T05:26:37Z","timestamp":1750483597000},"page":"46133-46149","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Image-text fine-grained alignment semantic guided reward-and-punishment mechanism for multi-modal Chinese-English neural language translation"],"prefix":"10.1007","volume":"84","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5644-8221","authenticated-orcid":false,"given":"Yanhui","family":"Ren","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,6,21]]},"reference":[{"key":"20985_CR1","doi-asserted-by":"publisher","first-page":"103986","DOI":"10.1016\/j.artint.2023.103986","volume":"323","author":"Y Yin","year":"2023","unstructured":"Yin Y, Zeng J, Su J et al (2023) Multi-modal graph contrastive encoding for neural machine translation[J]. Artif Intell 323:103986","journal-title":"Artif Intell"},{"key":"20985_CR2","doi-asserted-by":"publisher","unstructured":"Guo J, Ye J, Xiang Y et al (2023) Layer-level progressive transformer with modality difference awareness for multi-modal neural machine translation[J]. IEEE\/ACM Trans Audio Speech Lang Process 31:3015\u20133026.\u00a0https:\/\/doi.org\/10.1109\/TASLP.2023.3301210","DOI":"10.1109\/TASLP.2023.3301210"},{"issue":"3","key":"20985_CR3","doi-asserted-by":"publisher","first-page":"989","DOI":"10.2298\/CSIS231225025Y","volume":"21","author":"J Yu","year":"2024","unstructured":"Yu J, Lu Z, Yin S et al (2024) News recommendation model based on encoder graph neural network and Bat optimization in online social multimedia Art education[J]. Comput Sci Inform Syst 21(3):989\u20131012. https:\/\/doi.org\/10.2298\/CSIS231225025Y","journal-title":"Comput Sci Inform Syst"},{"key":"20985_CR4","doi-asserted-by":"publisher","unstructured":"Wang Y, Zeng Y, Liang J et al (2024) RetrievalMMT: retrieval-constrained multi-modal prompt learning for multi-modal machine translation[C]. Proceedings of the 2024 International Conference on Multimedia Retrieval. 860\u2013868.\u00a0https:\/\/doi.org\/10.1145\/3672608.3707816","DOI":"10.1145\/3672608.3707816"},{"issue":"6","key":"20985_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3589341","volume":"22","author":"X Huang","year":"2023","unstructured":"Huang X, Zhang J, Zong C (2023) Contrastive adversarial training for Multi-Modal machine Translation[J]. ACM Trans Asian Low-Resource Lang Inform Process 22(6):1\u201318","journal-title":"ACM Trans Asian Low-Resource Lang Inform Process"},{"key":"20985_CR6","doi-asserted-by":"publisher","unstructured":"Gupta D, Kharbanda S, Zhou J et al (2023) CLIPTrans: transferring visual knowledge with pre-trained models for multimodal machine translation[C]. Proceedings of the IEEE\/CVF international conference on computer vision. 2875\u20132886.\u00a0https:\/\/doi.org\/10.1109\/ICCV51070.2023.00269","DOI":"10.1109\/ICCV51070.2023.00269"},{"issue":"6","key":"20985_CR7","doi-asserted-by":"publisher","first-page":"1223","DOI":"10.1007\/s11390-023-1302-6","volume":"38","author":"C Wang","year":"2023","unstructured":"Wang C, Cai SJ, Shi BX et al (2023) Visual Topic Semantic Enhanced Machine Translation for Multi-Modal Data Efficiency[J]. J Comput Sci Technol 38(6):1223\u20131236","journal-title":"J Comput Sci Technol"},{"key":"20985_CR8","doi-asserted-by":"publisher","first-page":"106403","DOI":"10.1016\/j.neunet.2024.106403","volume":"178","author":"J Guo","year":"2024","unstructured":"Guo J, Su R, Ye J (2024) Multi-grained visual pivot-guided multi-modal neural machine translation with text-aware cross-modal contrastive disentangling[J]. Neural Netw 178:106403","journal-title":"Neural Netw"},{"key":"20985_CR9","unstructured":"Fataliyev K, Liu W (2023) MCASP: multi-modal cross attention network for stock market prediction[C]. Proceedings of the 21st Annual Workshop of the Australasian Language Technology Association. 67\u201377. https:\/\/aclanthology.org\/2023.alta-1.7\/"},{"key":"20985_CR10","doi-asserted-by":"publisher","unstructured":"Zhou F, Chen H (2023) Cross-modal translation and alignment for survival analysis[C]. Proceedings of the IEEE\/CVF International Conference on Computer Vision. 21485\u201321494.\u00a0https:\/\/doi.org\/10.1109\/ICCV51070.2023.01964","DOI":"10.1109\/ICCV51070.2023.01964"},{"issue":"2","key":"20985_CR11","first-page":"13","volume":"2","author":"S Yin","year":"2024","unstructured":"Yin S, Li H, Sun Y et al (2024) Data visualization analysis based on explainable artificial intelligence: A Survey[J]. IJLAI Trans Sci Eng 2(2):13\u201320","journal-title":"IJLAI Trans Sci Eng"},{"issue":"6","key":"20985_CR12","doi-asserted-by":"publisher","first-page":"e0287557","DOI":"10.1371\/journal.pone.0287557","volume":"18","author":"Y Xu","year":"2023","unstructured":"Xu Y, Zhang L, Shen X (2023) Multi-modal adaptive gated mechanism for visual question answering[J]. PLoS ONE 18(6):e0287557","journal-title":"PLoS ONE"},{"issue":"3","key":"20985_CR13","doi-asserted-by":"publisher","first-page":"189","DOI":"10.47738\/jads.v4i3.113","volume":"4","author":"DA Sulistyo","year":"2023","unstructured":"Sulistyo DA, Wibawa AP, Prasetya DD et al (2023) LSTM-Based machine translation for Madurese-Indonesian[J]. J Appl Data Sci 4(3):189\u2013199","journal-title":"J Appl Data Sci"},{"key":"20985_CR14","doi-asserted-by":"publisher","first-page":"106461","DOI":"10.1016\/j.est.2022.106461","volume":"58","author":"J Schmitt","year":"2023","unstructured":"Schmitt J, Horstk\u00f6tter I, B\u00e4ker B (2023) Electrical lithium-ion battery models based on recurrent neural networks: A holistic approach[J]. J Energy Storage 58:106461","journal-title":"J Energy Storage"},{"issue":"2","key":"20985_CR15","doi-asserted-by":"publisher","first-page":"1529","DOI":"10.1109\/TNSM.2023.3273991","volume":"20","author":"L Teng","year":"2023","unstructured":"Teng L, Qiao Y, Shafiq M et al (2023) FLPK-BiSeNet: federated learning based on priori knowledge and bilateral segmentation network for image edge extraction[J]. IEEE Trans Netw Serv Manage 20(2):1529\u20131542","journal-title":"IEEE Trans Netw Serv Manage"},{"key":"20985_CR16","doi-asserted-by":"publisher","first-page":"119951","DOI":"10.1016\/j.ins.2023.119951","volume":"657","author":"M Lu","year":"2024","unstructured":"Lu M, Xu X (2024) TRNN: an efficient time-series recurrent neural network for stock price prediction[J]. Inf Sci 657:119951","journal-title":"Inf Sci"},{"key":"20985_CR17","doi-asserted-by":"publisher","first-page":"110419","DOI":"10.1016\/j.asoc.2023.110419","volume":"143","author":"R Lin","year":"2023","unstructured":"Lin R, Wang H, Xiong M et al (2023) Attention-based gate recurrent unit for remaining useful life prediction in prognostics[J]. Appl Soft Comput 143:110419","journal-title":"Appl Soft Comput"},{"key":"20985_CR18","doi-asserted-by":"publisher","unstructured":"Srisurya IV (2023) Neural machine translation using adam optimised generative adversarial network[C]. 2023 7th International Conference on Computing Methodologies and Communication (ICCMC). IEEE. 383\u2013387.\u00a0https:\/\/doi.org\/10.1109\/ICCMC56507.2023.10084034","DOI":"10.1109\/ICCMC56507.2023.10084034"},{"key":"20985_CR19","doi-asserted-by":"publisher","first-page":"112110","DOI":"10.1016\/j.commatsci.2023.112110","volume":"223","author":"AAK Farizhandi","year":"2023","unstructured":"Farizhandi AAK, Mamivand M (2023) Spatiotemporal prediction of microstructure evolution with predictive recurrent neural network[J]. Comput Mater Sci 223:112110","journal-title":"Comput Mater Sci"},{"issue":"5","key":"20985_CR20","doi-asserted-by":"publisher","first-page":"13159","DOI":"10.1007\/s11042-023-16113-2","volume":"83","author":"PSS Gopi","year":"2024","unstructured":"Gopi PSS, Karthikeyan M (2024) Red Fox optimization with ensemble recurrent neural network for crop recommendation and yield prediction model[J]. Multimedia Tools Appl 83(5):13159\u201313179","journal-title":"Multimedia Tools Appl"},{"key":"20985_CR21","doi-asserted-by":"publisher","first-page":"109948","DOI":"10.1016\/j.engappai.2024.109948","volume":"142","author":"S Chowdhury","year":"2025","unstructured":"Chowdhury S, Soni B (2025) ENVQA: improving visual question answering model by enriching the visual feature[J]. Eng Appl Artif Intell 142:109948","journal-title":"Eng Appl Artif Intell"},{"key":"20985_CR22","doi-asserted-by":"publisher","unstructured":"Lee C, Yang T, Chen Z et al (2023) Heterogeneous anomaly detection for software systems via semi-supervised cross-modal attention[C]. 2023 IEEE\/ACM 45th International Conference on Software Engineering (ICSE). IEEE. 1724\u20131736.\u00a0https:\/\/doi.org\/10.1109\/ICSE48619.2023.00148","DOI":"10.1109\/ICSE48619.2023.00148"},{"key":"20985_CR23","doi-asserted-by":"publisher","first-page":"111329","DOI":"10.1016\/j.knosys.2023.111329","volume":"284","author":"J Liu","year":"2024","unstructured":"Liu J, Guan S, Zou Q et al (2024) Attention aware multi-modal fusion using a dual graph transformer for drug\u2013disease associations prediction[J]. Knowl Based Syst 284:111329","journal-title":"Knowl Based Syst"},{"key":"20985_CR24","doi-asserted-by":"publisher","first-page":"8346","DOI":"10.1109\/TMM.2023.3235495","volume":"25","author":"Y Xu","year":"2023","unstructured":"Xu Y, Bin Y, Wei J et al (2023) Multi-modal transformer with global-local alignment for composed query image retrieval[J]. IEEE Trans Multimedia 25:8346\u20138357","journal-title":"IEEE Trans Multimedia"},{"key":"20985_CR25","doi-asserted-by":"crossref","unstructured":"Li Q, Ji C, Guo S et al (2023) Multi-modal knowledge graph transformer framework for multi-modal entity alignment[J]. arxiv preprint arxiv:2310.06365. https:\/\/openreview.net\/forum?id=xzveggFhiQ","DOI":"10.18653\/v1\/2023.findings-emnlp.70"},{"issue":"9","key":"20985_CR26","doi-asserted-by":"publisher","first-page":"10429","DOI":"10.1007\/s10489-022-04055-5","volume":"53","author":"B Wang","year":"2023","unstructured":"Wang B, Feng Y, Xiong X et al (2023) Multi-modal transformer using two-level visual features for fake news detection[J]. Appl Intell 53(9):10429\u201310443","journal-title":"Appl Intell"},{"issue":"2","key":"20985_CR27","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1007\/s11227-024-06840-0","volume":"81","author":"D Wu","year":"2025","unstructured":"Wu D, Zhang L, Chen Y (2025) Syntactic-guided optimization of image\u2013text matching for intra-modal modeling[J]. J Supercomputing 81(2):367","journal-title":"J Supercomputing"},{"issue":"2","key":"20985_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3545573","volume":"19","author":"S Liang","year":"2023","unstructured":"Liang S, Zhu A, Zhang J et al (2023) Hyper-node relational graph attention network for multi-modal knowledge graph completion[J]. ACM Trans Multimedia Comput Commun Appl 19(2):1\u201321","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"20985_CR29","doi-asserted-by":"publisher","unstructured":"Wang R, Liu J, Li M et al (2024) Multi-modal online review driven product improvement design based on scientific effects knowledge graph[J]. J Eng Des 1\u201338. https:\/\/doi.org\/10.1080\/09544828.2023.2301229","DOI":"10.1080\/09544828.2023.2301229"},{"key":"20985_CR30","doi-asserted-by":"publisher","unstructured":"Wang Q, Mao Y, Wang J et al (2023) Aprompt: attention prompt tuning for efficient adaptation of pre-trained language models[C]. Proceedings of the 2023 conference on empirical methods in natural language processing. 9147\u20139160.\u00a0https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.567","DOI":"10.18653\/v1\/2023.emnlp-main.567"},{"key":"20985_CR31","doi-asserted-by":"publisher","unstructured":"Wang Q, Yang L, Quan X et al (2022) Learning to generate question by asking question: a primal-dual approach with uncommon word generation[C]. Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing. 46\u201361.\u00a0https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.4","DOI":"10.18653\/v1\/2022.emnlp-main.4"},{"key":"20985_CR32","doi-asserted-by":"publisher","unstructured":"Yan L, Wang Q, Cui Y et al (2022) GL-RG: global-local representation granularity for video captioning[J]. rXiv preprint arXiv:2205.10706.\u00a0https:\/\/doi.org\/10.24963\/ijcai.2022\/381","DOI":"10.24963\/ijcai.2022\/381"},{"key":"20985_CR33","doi-asserted-by":"publisher","unstructured":"Yin S, Li H, Laghari AA et al (2024) FLSN-MVO: edge computing and privacy protection based on federated learning Siamese network with multi-verse optimization algorithm for industry 5.0[J]. IEEE Open J Commun Soc 6:3443\u20133458.\u00a0https:\/\/doi.org\/10.1109\/OJCOMS.2024.3520562","DOI":"10.1109\/OJCOMS.2024.3520562"},{"key":"20985_CR34","first-page":"3882","volume":"37","author":"J Wang","year":"2024","unstructured":"Wang J, Wang P, Liu D et al (2024) Diffusion-inspired truncated sampler for text-video retrieval[J]. Adv Neural Inf Process Syst 37:3882\u20133906","journal-title":"Adv Neural Inf Process Syst"},{"key":"20985_CR35","doi-asserted-by":"publisher","unstructured":"Wang J, Sun G, Wang P et al (2024) Text is mass: modeling as stochastic embedding for text-video retrieval[C]. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 16551\u201316560.\u00a0https:\/\/doi.org\/10.1109\/CVPR52733.2024.01566","DOI":"10.1109\/CVPR52733.2024.01566"},{"key":"20985_CR36","doi-asserted-by":"publisher","unstructured":"Nishihara T, Tamura A, Ninomiya T et al (2020) Supervised visual attention for multimodal neural machine translation[C]. Proceedings of the 28th International Conference on Computational Linguistics. 4304\u20134314.\u00a0https:\/\/doi.org\/10.18653\/v1\/2020.coling-main.380","DOI":"10.18653\/v1\/2020.coling-main.380"},{"key":"20985_CR37","doi-asserted-by":"publisher","first-page":"119892","DOI":"10.1016\/j.ins.2023.119892","volume":"654","author":"Z Zhu","year":"2024","unstructured":"Zhu Z, Li X, Chen H et al (2024) An effective and robust genetic algorithm with hybrid multi-strategy and mechanism for airport gate allocation[J]. Inf Sci 654:119892","journal-title":"Inf Sci"},{"key":"20985_CR38","doi-asserted-by":"publisher","unstructured":"Vijayan V, Bowen B, Grigsby S et al (2024) The case for evaluating multimodal translation models on text datasets[J]. arxiv preprint arxiv:2403.03014. https:\/\/doi.org\/10.48550\/arXiv.2403.03014","DOI":"10.48550\/arXiv.2403.03014"},{"key":"20985_CR39","doi-asserted-by":"publisher","unstructured":"Yan L, Han C, Xu Z et al (2023) Prompt learns prompt: exploring knowledge-aware generative prompt collaboration for video captioning[C]. Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence (IJCAI-23). 180:1622\u20131630. https:\/\/doi.org\/10.24963\/ijcai.2023\/180","DOI":"10.24963\/ijcai.2023\/180"},{"key":"20985_CR40","doi-asserted-by":"crossref","unstructured":"Wang D, Xiong D (2021) Efficient object-level visual context modeling for multimodal machine translation: masking irrelevant objects helps grounding[C]. Proceedings of the AAAI conference on artificial intelligence. 35(4):2720\u20132728","DOI":"10.1609\/aaai.v35i4.16376"},{"key":"20985_CR41","doi-asserted-by":"publisher","unstructured":"Lin H, Meng F, Su J et al (2020) Dynamic context-guided capsule network for multimodal machine translation[C]. Proceedings of the 28th ACM international conference on multimedia. 1320\u20131329. https:\/\/doi.org\/10.1145\/3394171.3413715","DOI":"10.1145\/3394171.3413715"},{"key":"20985_CR42","doi-asserted-by":"publisher","unstructured":"Hou Z, Guo J (2024) Virtual visual-guided domain-shadow fusion via modal exchanging for domain-specific multi-modal neural machine translation[C]. Proceedings of the 32nd ACM International Conference on Multimedia. 4227\u20134235. https:\/\/doi.org\/10.1145\/3664647.3681525","DOI":"10.1145\/3664647.3681525"},{"key":"20985_CR43","unstructured":"Hatami A, Buitelaar P, Arcan M (2023) A filtering approach to object region detection in multimodal machine translation[C]. Proceedings of Machine Translation Summit XIX, Vol. 1: Research Track. 393\u2013405"},{"key":"20985_CR44","doi-asserted-by":"publisher","first-page":"101935","DOI":"10.1016\/j.inffus.2023.101935","volume":"100","author":"B Zhu","year":"2023","unstructured":"Zhu B, Wu M, Hong Y et al (2023) Mmiea: Multi-modal interaction entity alignment model for knowledge graphs[J]. Inform Fusion 100:101935","journal-title":"Inform Fusion"},{"key":"20985_CR45","doi-asserted-by":"publisher","first-page":"14194","DOI":"10.1007\/s10489-022-03331-8","volume":"52","author":"J Ye","year":"2022","unstructured":"Ye J, Guo J (2022) Dual-level interactive multimodal-mixup encoder for multi-modal neural machine translation[J]. Appl Intell 52:14194\u201314203. https:\/\/doi.org\/10.1007\/s10489-022-03331-8","journal-title":"Appl Intell"},{"key":"20985_CR46","doi-asserted-by":"crossref","unstructured":"Guo J, Hou Z, **an Y et al (2024) Progressive modality-complement aggregative multitransformer for domain multi-modal neural machine translation[J]. Pattern Recogn 149:110294","DOI":"10.1016\/j.patcog.2024.110294"},{"key":"20985_CR47","doi-asserted-by":"publisher","unstructured":"Ouenniche K, Tapu R, Zaharia T (2023) Vision-text cross-modal fusion for accurate video captioning[J]. IEEE Access 11:115477\u2013115492.\u00a0https:\/\/doi.org\/10.1109\/ACCESS.2023.3324052","DOI":"10.1109\/ACCESS.2023.3324052"},{"key":"20985_CR48","doi-asserted-by":"publisher","unstructured":"Arslan HS, Fishel M, Anbarjafari G (2018) Doubly attentive transformer machine translation[J]. arxiv preprint arxiv:1807.11605. https:\/\/doi.org\/10.48550\/arXiv.1807.11605","DOI":"10.48550\/arXiv.1807.11605"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20985-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-025-20985-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20985-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:17:32Z","timestamp":1765387052000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-025-20985-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,21]]},"references-count":48,"journal-issue":{"issue":"37","published-online":{"date-parts":[[2025,11]]}},"alternative-id":["20985"],"URL":"https:\/\/doi.org\/10.1007\/s11042-025-20985-x","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2025,6,21]]},"assertion":[{"value":"16 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 March 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 June 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 June 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The manuscript is not be submitted to more than one journal for simultaneous consideration.The submitted work is original and is not have been published elsewhere in any form or language.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical"}},{"value":"The authors declare no conflict of interest involved in this paper.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}]}}