{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T03:21:34Z","timestamp":1740108094569,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"22","license":[{"start":{"date-parts":[[2024,4,29]],"date-time":"2024-04-29T00:00:00Z","timestamp":1714348800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,29]],"date-time":"2024-04-29T00:00:00Z","timestamp":1714348800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s00521-024-09705-y","type":"journal-article","created":{"date-parts":[[2024,4,29]],"date-time":"2024-04-29T12:01:55Z","timestamp":1714392115000},"page":"13853-13864","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Dose multimodal machine translation can improve translation performance?"],"prefix":"10.1007","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4466-3418","authenticated-orcid":false,"given":"ShaoDong","family":"Cui","sequence":"first","affiliation":[]},{"given":"Kaibo","family":"Duan","sequence":"additional","affiliation":[]},{"given":"Wen","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Hiroyuki","family":"Shinnou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,29]]},"reference":[{"key":"9705_CR1","doi-asserted-by":"crossref","unstructured":"Barrault L, Bougares F, Specia L, Lala C, Elliott D, Frank S (2018) Findings of the third shared task on multimodal machine translation. In: Proceedings of the third conference on machine translation: shared task papers, pp 304\u2013323","DOI":"10.18653\/v1\/W18-6402"},{"key":"9705_CR2","unstructured":"Caglayan O, Barrault L, Bougares F (2016) Multimodal attention for neural machine translation. arXiv preprint arXiv:1609.03976"},{"key":"9705_CR3","doi-asserted-by":"publisher","unstructured":"Caglayan O, Aransa W, Bardet A, Garc\u00eda-Mart\u00ednez M, Bougares F, Barrault L, Masana M, Herranz L, van\u00a0de Weijer J (2017) LIUM-CVC submissions for WMT17 multimodal translation task. In: Proceedings of the second conference on machine translation, association for computational linguistics, Copenhagen, Denmark, pp 432\u2013439 https:\/\/doi.org\/10.18653\/v1\/W17-4746","DOI":"10.18653\/v1\/W17-4746"},{"key":"9705_CR4","doi-asserted-by":"crossref","unstructured":"Caglayan O, Madhyastha P, Specia L, Barrault L (2019) Probing the need for visual context in multimodal machine translation. arXiv preprint arXiv:1903.08678","DOI":"10.18653\/v1\/N19-1422"},{"key":"9705_CR5","doi-asserted-by":"publisher","unstructured":"Caglayan O, Ive J, Haralampieva V, Madhyastha P, Barrault L, Specia L (2020) Simultaneous machine translation with visual context. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP), association for computational linguistics, Online, pp 2350\u2013236https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.184","DOI":"10.18653\/v1\/2020.emnlp-main.184"},{"key":"9705_CR6","doi-asserted-by":"publisher","unstructured":"Calixto I, Rios M, Aziz W (2019) Latent variable model for multi-modal translation. In: Proceedings of the 57th annual meeting of the association for computational linguistics, association for computational linguistics, Florence, Italy, pp 6392\u2013640https:\/\/doi.org\/10.18653\/v1\/P19-1642","DOI":"10.18653\/v1\/P19-1642"},{"key":"9705_CR7","unstructured":"Carlsson F, Eisen P, Rekathati F, Sahlgren M (2022) Cross-lingual and multilingual clip. In: Proceedings of the thirteenth language resources and evaluation conference, pp 6848\u20136854"},{"key":"9705_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.108598","volume":"245","author":"S Chen","year":"2022","unstructured":"Chen S, Zeng Y, Cao D, Lu S (2022) Video-guided machine translation via dual-level back-translation. Knowl Based Syst 245:108598","journal-title":"Knowl Based Syst"},{"key":"9705_CR9","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et\u00a0al. (2020) An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"9705_CR10","doi-asserted-by":"crossref","unstructured":"Elliott D (2018) Adversarial evaluation of multimodal machine translation. In: EMNLP, pp 2974\u20132978","DOI":"10.18653\/v1\/D18-1329"},{"key":"9705_CR11","doi-asserted-by":"publisher","unstructured":"Elliott D, Frank S, Sima\u2019an K, Specia L (2016) Multi30k: multilingual English\u2013German image descriptions. In: Proceedings of the 5th workshop on vision and language, association for computational linguistics, pp 70\u201377. https:\/\/doi.org\/10.18653\/v1\/W16-3210","DOI":"10.18653\/v1\/W16-3210"},{"key":"9705_CR12","doi-asserted-by":"crossref","unstructured":"Elliott D, Frank S, Barrault L, Bougares F, Specia L (2017) Findings of the second shared task on multimodal machine translation and multilingual image description. In: Proceedings of the second conference on machine translation, volume 2: shared task papers, association for computational linguistics, Copenhagen, Denmark, pp 215\u2013233. http:\/\/www.aclweb.org\/anthology\/W17-4718","DOI":"10.18653\/v1\/W17-4718"},{"key":"9705_CR13","unstructured":"Gain B, Bandyopadhyay D, Mukherjee S, Adak C, Ekbal A (2023) Impact of visual context on noisy multimodal NMT: an empirical study for English to Indian languages. arXiv preprint arXiv:2308.16075"},{"key":"9705_CR14","doi-asserted-by":"crossref","unstructured":"Gr\u00f6nroos SA, Huet B, Kurimo M, Laaksonen J, Merialdo B, Pham P, Sj\u00f6berg M, Sulubacak U, Tiedemann J, Troncy R et\u00a0al (2018) The MeMAD submission to the wmt18 multimodal translation task. arXiv preprint arXiv:1808.10802","DOI":"10.18653\/v1\/W18-6439"},{"key":"9705_CR15","doi-asserted-by":"crossref","unstructured":"Gupta D, Kharbanda S, Zhou J, Li W, Pfister H, Wei D (2023) CLIPTrans: transferring visual knowledge with pre-trained models for multimodal machine translation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2875\u20132886","DOI":"10.1109\/ICCV51070.2023.00269"},{"key":"9705_CR16","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"9705_CR17","doi-asserted-by":"crossref","unstructured":"Helcl J, Libovick\u1ef3 J, Vari\u0161 D (2018) CUNI system for the WMT18 multimodal translation task. arXiv preprint arXiv:1811.04697","DOI":"10.18653\/v1\/W18-6441"},{"key":"9705_CR18","doi-asserted-by":"crossref","unstructured":"Huang PY, Liu F, Shiang SR, Oh J, Dyer C (2016) Attention-based multimodal neural machine translation. In: Proceedings of the first conference on machine translation, shared task papers, vol 2, pp 639\u2013645","DOI":"10.18653\/v1\/W16-2360"},{"key":"9705_CR19","unstructured":"Imankulova A, Kaneko M, Hirasawa T, Komachi M (2020) Toward multimodal simultaneous neural machine translation. In: Proceedings of the fifth conference on machine translation, association for computational linguistics, Online, pp 540\u2013549 https:\/\/www.aclweb.org\/anthology\/2020.wmt-1.70"},{"key":"9705_CR20","doi-asserted-by":"publisher","first-page":"352","DOI":"10.1016\/j.inffus.2022.10.018","volume":"91","author":"L Li","year":"2023","unstructured":"Li L, Tayir T, Han Y, Tao X, Vel\u00e1squez JD (2023) Multimodality information fusion for automated machine translation. Inf Fusion 91:352\u2013363. https:\/\/doi.org\/10.1016\/j.inffus.2022.10.018","journal-title":"Inf Fusion"},{"key":"9705_CR21","doi-asserted-by":"publisher","unstructured":"Libovick\u00fd J, Helcl J (2017) Attention strategies for multi-source sequence-to-sequence learning. In: Barzilay R, Kan MY (eds) Proceedings of the 55th annual meeting of the association for computational linguistics (vol 2: short papers), association for computational linguistics, Vancouver, Canada, pp 196\u201320https:\/\/doi.org\/10.18653\/v1\/P17-2031","DOI":"10.18653\/v1\/P17-2031"},{"key":"9705_CR22","doi-asserted-by":"crossref","unstructured":"Lin H, Meng F, Su J, Yin Y, Yang Z, Ge Y, Zhou J, Luo J (2020) Dynamic context-guided capsule network for multimodal machine translation. In: Proceedings of the 28th ACM international conference on multimedia, pp 1320\u20131329","DOI":"10.1145\/3394171.3413715"},{"key":"9705_CR23","unstructured":"Liu P, Cao H, Zhao T (2021) Gumbel-attention for multi-modal machine translation. arXiv preprint arXiv:2103.08862"},{"key":"9705_CR24","doi-asserted-by":"publisher","unstructured":"Long Q, Wang M, Li L (2021) Generative imagination elevates machine translation. In: Proceedings of the 2021 conference of the North American chapter of the association for computational linguistics: human language technologies, association for computational linguistics, Online, pp 5738\u2013574https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.457","DOI":"10.18653\/v1\/2021.naacl-main.457"},{"key":"9705_CR25","doi-asserted-by":"crossref","unstructured":"Madhyastha PS, Wang J, Specia L (2017) Sheffield multimt: using object posterior predictions for multimodal machine translation. In: Proceedings of the second conference on machine translation, pp 470\u2013476","DOI":"10.18653\/v1\/W17-4752"},{"key":"9705_CR26","doi-asserted-by":"crossref","unstructured":"Peng R, Zeng Y, Zhao J (2022) Distill the image to nowhere: inversion knowledge distillation for multimodal machine translation. In: Proceedings of the 2022 conference on empirical methods in natural language processing, association for computational linguistics, Abu Dhabi, United Arab Emirates, pp 2379\u20132390 https:\/\/aclanthology.org\/2022.emnlp-main.152","DOI":"10.18653\/v1\/2022.emnlp-main.152"},{"key":"9705_CR27","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning, PMLR, pp 8748\u20138763"},{"issue":"1","key":"9705_CR28","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel C, Shazeer N, Roberts A, Lee K, Narang S, Matena M, Zhou Y, Li W, Liu PJ (2020) Exploring the limits of transfer learning with a unified text-to-text transformer. J Mach Learn Res 21(1):5485\u20135551","journal-title":"J Mach Learn Res"},{"key":"9705_CR29","doi-asserted-by":"crossref","unstructured":"Shaw P, Uszkoreit J, Vaswani A (2018) Self-attention with relative position representations. arXiv preprint arXiv:1803.02155","DOI":"10.18653\/v1\/N18-2074"},{"key":"9705_CR30","doi-asserted-by":"publisher","first-page":"3013","DOI":"10.1109\/TMM.2021.3092187","volume":"24","author":"Y Song","year":"2021","unstructured":"Song Y, Chen S, Jin Q, Luo W, Xie J, Huang F (2021) Enhancing neural machine translation with dual-side multimodal awareness. IEEE Trans Multimedia 24:3013\u20133024","journal-title":"IEEE Trans Multimedia"},{"key":"9705_CR31","doi-asserted-by":"crossref","unstructured":"Specia L, Frank S, Sima\u2019An K, Elliott D (2016) A shared task on multimodal machine translation and crosslingual image description. In: Proceedings of the first conference on machine translation, shared task papers, vol 2, pp 543\u2013553","DOI":"10.18653\/v1\/W16-2346"},{"key":"9705_CR32","unstructured":"Tamura H, Hirasawa T, Kaneko M, Komachi M (2020) TMU Japanese-English multimodal machine translation system for wat 2020. In: Proceedings of the 7th workshop on Asian translation, pp 80\u201391"},{"key":"9705_CR33","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing system, vol 30"},{"key":"9705_CR34","doi-asserted-by":"crossref","unstructured":"Wang X, Wu J, Chen J, Li L, Wang YF, Wang WY (2019) VaTeX: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 4581\u20134591","DOI":"10.1109\/ICCV.2019.00468"},{"key":"9705_CR35","doi-asserted-by":"crossref","unstructured":"Wu Z, Kong L, Bi W, Li X, Kao B (2021a) Good for misconceived reasons: an empirical revisiting on the need for visual context in multimodal machine translation. arXiv preprint arXiv:2105.14462","DOI":"10.18653\/v1\/2021.acl-long.480"},{"key":"9705_CR36","doi-asserted-by":"publisher","unstructured":"Wu Z, Kong L, Bi W, Li X, Kao B (2021b) Good for misconceived reasons: an empirical revisiting on the need for visual context in multimodal machine translation. In: Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (Volume 1: long papers), association for computational linguistics, Online, pp 6153\u2013616 https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.480","DOI":"10.18653\/v1\/2021.acl-long.480"},{"key":"9705_CR37","first-page":"9418","volume":"34","author":"P Yang","year":"2020","unstructured":"Yang P, Chen B, Zhang P, Sun X (2020) Visual agreement regularized training for multi-modal machine translation. Proc AAAI Conf Artif Intell 34:9418\u20139425","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"9705_CR38","doi-asserted-by":"publisher","first-page":"388","DOI":"10.2197\/ipsjjip.30.388","volume":"30","author":"Z Yang","year":"2022","unstructured":"Yang Z, Hirasawa T, Komachi M, Okazaki N (2022) Why videos do not guide translations in video-guided machine translation? An empirical evaluation of video-guided machine translation dataset. J Inform Process 30:388\u2013396","journal-title":"J Inform Process"},{"key":"9705_CR39","doi-asserted-by":"crossref","unstructured":"Yao S, Wan X (2020) Multimodal transformer for multimodal machine translation. In: Proceedings of the 58th annual meeting of the association for computational linguistics, pp 4346\u20134350","DOI":"10.18653\/v1\/2020.acl-main.400"},{"key":"9705_CR40","doi-asserted-by":"crossref","unstructured":"Yin Y, Meng F, Su J, Zhou C, Yang Z, Zhou J, Luo J (2020) A novel graph-based multi-modal fusion encoder for neural machine translation. arXiv preprint arXiv:2007.08742","DOI":"10.18653\/v1\/2020.acl-main.273"},{"key":"9705_CR41","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J (2014) From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans Assoc Comput Linguist 2:67\u201378","journal-title":"Trans Assoc Comput Linguist"},{"key":"9705_CR42","unstructured":"Zhao Y, Komachi M, Kajiwara T, Chu C (2020) Double attention-based multimodal neural machine translation with semantic image regions. In: Proceedings of the 22nd annual conference of the European association for machine translation, pp 105\u2013114"},{"key":"9705_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.neucom.2021.12.076","volume":"476","author":"Y Zhao","year":"2022","unstructured":"Zhao Y, Komachi M, Kajiwara T, Chu C (2022) Region-attentive multimodal neural machine translation. Neurocomputing 476:1\u201313","journal-title":"Neurocomputing"},{"key":"9705_CR44","doi-asserted-by":"publisher","unstructured":"Zhou M, Cheng R, Lee YJ, Yu Z (2018) A visual attention grounding neural model for multimodal machine translation. In: Proceedings of the 2018 conference on empirical methods in natural language processing, association for computational linguistics, Brussels, Belgium, pp 3643\u2013365https:\/\/doi.org\/10.18653\/v1\/D18-1400","DOI":"10.18653\/v1\/D18-1400"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-09705-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-024-09705-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-09705-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,9]],"date-time":"2024-08-09T18:13:23Z","timestamp":1723227203000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-024-09705-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,29]]},"references-count":44,"journal-issue":{"issue":"22","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["9705"],"URL":"https:\/\/doi.org\/10.1007\/s00521-024-09705-y","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"type":"print","value":"0941-0643"},{"type":"electronic","value":"1433-3058"}],"subject":[],"published":{"date-parts":[[2024,4,29]]},"assertion":[{"value":"31 August 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 March 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 April 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This article does not contain any studies with human participants or animals performed by any of the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}}]}}