{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T21:05:03Z","timestamp":1753736703058,"version":"3.37.3"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"1-2","license":[{"start":{"date-parts":[[2019,4,8]],"date-time":"2019-04-08T00:00:00Z","timestamp":1554681600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100001602","name":"Science Foundation Ireland","doi-asserted-by":"publisher","award":["Grant 13\/RC\/2106"],"award-info":[{"award-number":["Grant 13\/RC\/2106"]}],"id":[{"id":"10.13039\/501100001602","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003246","name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek","doi-asserted-by":"publisher","award":["27789002"],"award-info":[{"award-number":["27789002"]}],"id":[{"id":"10.13039\/501100003246","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Translation"],"published-print":{"date-parts":[[2019,6]]},"DOI":"10.1007\/s10590-019-09226-9","type":"journal-article","created":{"date-parts":[[2019,4,8]],"date-time":"2019-04-08T17:47:44Z","timestamp":1554745664000},"page":"155-177","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["An error analysis for image-based multi-modal neural machine translation"],"prefix":"10.1007","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6244-7906","authenticated-orcid":false,"given":"Iacer","family":"Calixto","sequence":"first","affiliation":[]},{"given":"Qun","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,4,8]]},"reference":[{"key":"9226_CR1","unstructured":"Bahdanau D, Cho K, Bengio Y (2015) Neural machine translation by jointly learning to align and translate. In: International conference on learning representations, ICLR 2015, San Diego, California"},{"key":"9226_CR2","doi-asserted-by":"crossref","unstructured":"Bruni E, Tran NK, Baroni M (2014) Multimodal distributional semantics. J Artif Intell Res 49(1):1\u201347. \n                    http:\/\/dl.acm.org\/citation.cfm?id=2655713.2655714","DOI":"10.1613\/jair.4135"},{"key":"9226_CR3","unstructured":"Caglayan O, Aransa W, Wang Y, Masana M, Garc\u00eda-Mart\u00ednez M, Bougares F, Barrault L, van\u00a0de Weijer J (2016) Does multimodality help human and machine for translation and image captioning? In: Proceedings of the first conference on machine translation, Berlin, Germany, pp 627\u2013633. \n                    http:\/\/www.aclweb.org\/anthology\/W\/W16\/W16-2358"},{"key":"9226_CR4","doi-asserted-by":"crossref","unstructured":"Caglayan O, Aransa W, Bardet A, Garc\u00eda-Mart\u00ednez M, Bougares F, Barrault L, Masana M, Herranz L, van\u00a0de Weijer J (2017) LIUM-CVC submissions for WMT17 multimodal translation task. In: Proceedings of the second conference on machine translation. Association for Computational Linguistics, Copenhagen, pp 432\u2013439. \n                    http:\/\/www.aclweb.org\/anthology\/W17-4746","DOI":"10.18653\/v1\/W17-4746"},{"key":"9226_CR5","unstructured":"Calixto I (2017) Incorporating visual information into neural machine translation. PhD Thesis, School of Computing, Dublin City University"},{"key":"9226_CR6","doi-asserted-by":"crossref","unstructured":"Calixto I, Liu Q (2017) Incorporating global visual features into attention-based neural machine translation. In: Proceedings of the 2017 conference on empirical methods in natural language processing. Association for Computational Linguistics, Copenhagen, pp 992\u20131003. \n                    https:\/\/www.aclweb.org\/anthology\/D17-1105","DOI":"10.18653\/v1\/D17-1105"},{"key":"9226_CR7","unstructured":"Calixto I, de\u00a0Campos T, Specia L (2012) Images as context in Statistical Machine Translation. In: Proceedings of the workshop on vision and language, VL 2012, Sheffield, England"},{"key":"9226_CR8","doi-asserted-by":"crossref","unstructured":"Calixto I, Elliott D, Frank S (2016) DCU-UvA multimodal MT system report. In: Proceedings of the first conference on machine translation, Berlin, Germany, pp 634\u2013638. \n                    http:\/\/www.aclweb.org\/anthology\/W\/W16\/W16-2359","DOI":"10.18653\/v1\/W16-2359"},{"key":"9226_CR9","doi-asserted-by":"crossref","unstructured":"Calixto I, Liu Q, Campbell N (2017) Doubly-attentive decoder for multi-modal neural machine translation. In: Proceedings of the 55th conference of the association for computational linguistics: long papers, Vancouver, Canada, vol 1","DOI":"10.18653\/v1\/P17-1175"},{"key":"9226_CR10","doi-asserted-by":"publisher","unstructured":"Chen M, Wang S, Liang PP, Baltru\u0161aitis T, Zadeh A, Morency LP (2017) Multimodal sentiment analysis with word-level fusion and reinforcement learning. In: Proceedings of the 19th ACM international conference on multimodal interaction, ICMI 2017. ACM, New York, pp 163\u2013171. \n                    https:\/\/doi.org\/10.1145\/3136755.3136801","DOI":"10.1145\/3136755.3136801"},{"key":"9226_CR11","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), Doha, Qatar, pp 1724\u20131734. \n                    http:\/\/www.aclweb.org\/anthology\/D14-1179","DOI":"10.3115\/v1\/D14-1179"},{"key":"9226_CR12","unstructured":"Church K, Patil R (1982) Coping with syntactic ambiguity or how to put the block in the box on the table. Comput Linguist 8(3-4):139\u2013149. \n                    http:\/\/dl.acm.org\/citation.cfm?id=972942.972946"},{"key":"9226_CR13","doi-asserted-by":"crossref","unstructured":"Denkowski M, Lavie A (2014) Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the EACL 2014 workshop on statistical machine translation","DOI":"10.3115\/v1\/W14-3348"},{"key":"9226_CR14","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA, Guadarrama S, Rohrbach M, Venugopalan S, Darrell T, Saenko K (2015) Long-term recurrent convolutional networks for visual recognition and description. In: 2015 IEEE conference on computer vision and pattern recognition (CVPR), Boston, US, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"9226_CR15","unstructured":"Elliott D, K\u00e1d\u00e1r A (2017) Imagination improves multimodal translation. In: Proceedings of the eighth international joint conference on natural language processing: long papers, vol 1. Asian Federation of Natural Language Processing, Taipei, pp 130\u2013141. \n                    http:\/\/www.aclweb.org\/anthology\/I17-1014"},{"key":"9226_CR16","unstructured":"Elliott D, Frank S, Hasler E (2015) Multi-language image description with neural sequence models. CoRR. \n                    arXiv: 1510.04709"},{"key":"9226_CR17","doi-asserted-by":"crossref","unstructured":"Elliott D, Frank S, Sima\u2019an K, Specia L (2016) Multi30K: multilingual English\u2013German image descriptions. In: Proceedings of the 5th workshop on vision and language, VL@ACL 2016, Berlin, Germany. \n                    http:\/\/aclweb.org\/anthology\/W\/W16\/W16-3210.pdf","DOI":"10.18653\/v1\/W16-3210"},{"key":"9226_CR18","doi-asserted-by":"crossref","unstructured":"Elliott D, Frank S, Barrault L, Bougares F, Specia L (2017) Findings of the second shared task on multimodal machine translation and multilingual image description. In: Proceedings of the second conference on machine translation. Association for Computational Linguistics, pp 215\u2013233. \n                    http:\/\/aclweb.org\/anthology\/W17-4718","DOI":"10.18653\/v1\/W17-4718"},{"key":"9226_CR19","unstructured":"Faghri F, Fleet DJ, Kiros JR, Fidler S (2017) VSE++: improved visual-semantic embeddings. \n                    arXiv:1707.05612"},{"key":"9226_CR20","doi-asserted-by":"crossref","unstructured":"Farhadi A, Hejrati M, Sadeghi MA, Young P, Rashtchian C, Hockenmaier J, Forsyth D (2010) Every picture tells a story: generating sentences from images. In: Proceedings of the 11th European conference on computer vision: Part IV, ECCV\u201910. Springer, Berlin, pp 15\u201329. \n                    http:\/\/dl.acm.org\/citation.cfm?id=1888089.1888092","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"9226_CR21","unstructured":"Gao H, Mao J, Zhou J, Huang Z, Wang L, Xu W (2015) Are you talking to a machine? Dataset and methods for multilingual image question answering. In: NIPS"},{"key":"9226_CR22","doi-asserted-by":"publisher","unstructured":"Girshick R, Donahue J, Darrell T, Malik J (2014) Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the 2014 IEEE conference on computer vision and pattern recognition, CVPR \u201914, Washington, DC, USA, pp 580\u2013587. \n                    https:\/\/doi.org\/10.1109\/CVPR.2014.81","DOI":"10.1109\/CVPR.2014.81"},{"key":"9226_CR23","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1006\/jmla.2000.2714","volume":"43","author":"A Glenberg","year":"2000","unstructured":"Glenberg A, Robertson D (2000) Symbol grounding and meaning: a comparison of high-dimensional and embodied theories of meaning. J Mem Lang 43:379\u2013401","journal-title":"J Mem Lang"},{"key":"9226_CR24","unstructured":"Graves A (2013) Generating sequences with recurrent neural networks. CoRR. \n                    arXiv:1308.0850"},{"key":"9226_CR25","doi-asserted-by":"crossref","unstructured":"Harnad S (1990) The symbol grounding problem. Physica D 42(1):335\u2013346. \n                    http:\/\/www.sciencedirect.com\/science\/article\/pii\/0167278990900876","DOI":"10.1016\/0167-2789(90)90087-6"},{"key":"9226_CR26","unstructured":"He K, Zhang X, Ren S, Sun J (2015) Deep residual learning for image recognition. arXiv preprint \n                    arXiv:1512.03385"},{"key":"9226_CR27","doi-asserted-by":"crossref","unstructured":"Hitschler J, Schamoni S, Riezler S (2016) Multimodal pivots for image caption translation. In: Proceedings of the 54th annual meeting of the Association for Computational Linguistics: long papers, Berlin, Germany, vol 1, pp 2399\u20132409. \n                    http:\/\/www.aclweb.org\/anthology\/P16-1227","DOI":"10.18653\/v1\/P16-1227"},{"key":"9226_CR28","doi-asserted-by":"crossref","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 47(1):853\u2013899. \n                    http:\/\/dl.acm.org\/citation.cfm?id=2566972.2566993","DOI":"10.1613\/jair.3994"},{"key":"9226_CR29","doi-asserted-by":"crossref","unstructured":"Huang PY, Liu F, Shiang SR, Oh J, Dyer C (2016) Attention-based multimodal neural machine translation. In: Proceedings of the first conference on machine translation, Berlin, Germany, pp 639\u2013645. \n                    http:\/\/www.aclweb.org\/anthology\/W\/W16\/W16-2360","DOI":"10.18653\/v1\/W16-2360"},{"key":"9226_CR30","unstructured":"Kalchbrenner N, Blunsom P (2013) Recurrent continuous translation models. In: Proceedings of the 2013 conference on empirical methods in natural language processing, EMNLP 2013, Seattle, USA, pp 1700\u20131709"},{"key":"9226_CR31","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR 2015, Boston, Massachusetts, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"9226_CR32","unstructured":"Kiros R, Salakhutdinov R, Zemel RS (2014) Unifying visual-semantic embeddings with multimodal neural language models. CoRR. \n                    arXiv:1411.2539"},{"key":"9226_CR33","volume-title":"Statistical machine translation","author":"P Koehn","year":"2010","unstructured":"Koehn P (2010) Statistical machine translation, 1st edn. Cambridge University Press, New York","edition":"1"},{"key":"9226_CR34","doi-asserted-by":"publisher","unstructured":"Koehn P, Och FJ, Marcu D (2003) Statistical phrase-based translation. In: Proceedings of the 2003 conference of the North American Chapter of the Association for Computational Linguistics on human language technology, NAACL \u201903, vol 1. Association for Computational Linguistics, Stroudsburg, pp 48\u201354. \n                    https:\/\/doi.org\/10.3115\/1073445.1073462","DOI":"10.3115\/1073445.1073462"},{"key":"9226_CR35","doi-asserted-by":"crossref","unstructured":"Koehn P, Hoang H, Birch A, Callison-Burch C, Federico M, Bertoldi N, Cowan B, Shen W, Moran C, Zens R, Dyer C, Bojar O, Constantin A, Herbst E (2007) Moses: open source toolkit for statistical machine translation. In: Proceedings of the 45th annual meeting of the ACL on interactive poster and demonstration sessions, ACL \u201907. Association for Computational Linguistics, Prague, pp 177\u2013180. \n                    http:\/\/dl.acm.org\/citation.cfm?id=1557769.1557821","DOI":"10.3115\/1557769.1557821"},{"key":"9226_CR36","doi-asserted-by":"crossref","unstructured":"Lazaridou A, Pham NT, Baroni M (2015) Combining language and vision with a multimodal skip-gram model. In: Proceedings of the 2015 conference of the North American Chapter of the Association for Computational Linguistics: human language technologies. Association for Computational Linguistics, Denver, pp 153\u2013163. \n                    http:\/\/www.aclweb.org\/anthology\/N15-1016","DOI":"10.3115\/v1\/N15-1016"},{"key":"9226_CR37","doi-asserted-by":"crossref","unstructured":"Libovick\u00fd J, Helcl J (2017) Attention strategies for multi-source sequence-to-sequence learning. In: Proceedings of the 55th annual meeting of the Association for Computational Linguistics: short papers, vol 2. Association for Computational Linguistics, Vancouver, pp 196\u2013202. \n                    http:\/\/aclweb.org\/anthology\/P17-2031","DOI":"10.18653\/v1\/P17-2031"},{"key":"9226_CR38","doi-asserted-by":"crossref","unstructured":"Libovick\u00fd J, Helcl J, Tlust\u00fd M, Bojar O, Pecina P (2016) CUNI system for WMT16 automatic post-editing and multimodal translation tasks. In: Proceedings of the first conference on machine translation, Berlin, Germany, pp 646\u2013654. \n                    http:\/\/www.aclweb.org\/anthology\/W\/W16\/W16-2361","DOI":"10.18653\/v1\/W16-2361"},{"key":"9226_CR39","unstructured":"Luong MT, Le QV, Sutskever I, Vinyals O, Kaiser L (2016) Multi-task sequence to sequence learning. In: Proceedings of the international conference on learning representations (ICLR), 2016, San Juan, Puerto Rico"},{"key":"9226_CR40","doi-asserted-by":"crossref","unstructured":"Luong T, Pham H, Manning CD (2015) Effective approaches to attention-based neural machine translation. In: Proceedings of the 2015 conference on empirical methods in natural language processing (EMNLP), Lisbon, Portugal, pp 1412\u20131421","DOI":"10.18653\/v1\/D15-1166"},{"key":"9226_CR41","doi-asserted-by":"crossref","unstructured":"Madhyastha PS, Wang J, Specia L (2017) Sheffield multiMT: using object posterior predictions for multimodal machine translation. In: Proceedings of the second conference on machine translation, Copenhagen, Denmark. Association for Computational Linguistics, pp 470\u2013476. \n                    http:\/\/www.aclweb.org\/anthology\/W17-4752","DOI":"10.18653\/v1\/W17-4752"},{"key":"9226_CR42","unstructured":"Mao J, Xu W, Yang Y, Wang J, Yuille AL (2014) Explain images with multimodal recurrent neural networks. \n                    arXiv:1410.1090"},{"key":"9226_CR43","unstructured":"Mao J, Xu J, Jing K, Yuille AL (2016) Training and evaluating multimodal word embeddings with large-scale web annotated images. In: Lee DD, Sugiyama M, Luxburg UV, Guyon I, Garnett R (eds) Advances in neural information processing systems 29. Curran Associates, Inc., pp 442\u2013450"},{"issue":"4","key":"9226_CR44","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1162\/0891201042544884","volume":"30","author":"FJ Och","year":"2004","unstructured":"Och FJ, Ney H (2004) The alignment template approach to statistical machine translation. Comput Linguist 30(4):417\u2013449. \n                    https:\/\/doi.org\/10.1162\/0891201042544884","journal-title":"Comput Linguist"},{"key":"9226_CR45","doi-asserted-by":"publisher","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on Association for Computational Linguistics, ACL \u201902, Philadelphia, Pennsylvania, pp 311\u2013318. \n                    https:\/\/doi.org\/10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135"},{"key":"9226_CR46","doi-asserted-by":"crossref","unstructured":"Popovi\u0107 M (2015) chrF: character n-gram F-score for automatic MT evaluation. In: Proceedings of the tenth workshop on statistical machine translation, Lisbon, Portugal, pp 392\u2013395. \n                    http:\/\/aclweb.org\/anthology\/W15-3049","DOI":"10.18653\/v1\/W15-3049"},{"issue":"3","key":"9226_CR47","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H, Krause J, Satheesh S, Ma S, Huang Z, Karpathy A, Khosla A, Bernstein M, Berg AC, Fei-Fei L (2015) ImageNet large scale visual recognition challenge. Int J Comput Vis 115(3):211\u2013252. \n                    https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int J Comput Vis"},{"key":"9226_CR48","doi-asserted-by":"publisher","unstructured":"Schuster M, Paliwal K (1997) Bidirectional recurrent neural networks. Trans Signal Process 45(11):2673\u20132681. \n                    https:\/\/doi.org\/10.1109\/78.650093","DOI":"10.1109\/78.650093"},{"key":"9226_CR49","doi-asserted-by":"crossref","unstructured":"Sennrich R, Haddow B, Birch A (2016) Neural machine translation of rare words with subword units. In: Proceedings of the 54th annual meeting of the Association for Computational Linguistics: long papers, Berlin, Germany, vol 1, pp 1715\u20131725. \n                    http:\/\/www.aclweb.org\/anthology\/P16-1162","DOI":"10.18653\/v1\/P16-1162"},{"key":"9226_CR50","doi-asserted-by":"crossref","unstructured":"Shah K, Wang J, Specia L (2016) SHEF-multimodal: grounding machine translation on images. In: Proceedings of the first conference on machine translation, Berlin, Germany, pp 660\u2013665. \n                    http:\/\/www.aclweb.org\/anthology\/W\/W16\/W16-2363","DOI":"10.18653\/v1\/W16-2363"},{"key":"9226_CR51","unstructured":"Silberer C, Lapata M (2012) Grounded models of semantic representation. In: Proceedings of the 2012 joint conference on empirical methods in natural language processing and computational natural language learning, , EMNLP-CoNLL \u201912, Stroudsburg, PA, USA. Association for Computational Linguistics, pp 1423\u20131433. \n                    http:\/\/dl.acm.org\/citation.cfm?id=2390948.2391110"},{"key":"9226_CR52","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. CoRR. \n                    arXiv:1409.1556"},{"key":"9226_CR53","unstructured":"Snover M, Dorr B, Schwartz R, Micciulla L, Makhoul J (2006) A study of translation edit rate with targeted human annotation. In: Proceedings of Association for Machine Translation in the Americas, Cambridge, MA, pp 223\u2013231"},{"key":"9226_CR54","unstructured":"Specia L, Frank S, Sima\u2019an K, Elliott D (2016) A shared task on multimodal machine translation and crosslingual image description. In: Proceedings of the first conference on machine translation, WMT 2016, Berlin, Germany, pp 543\u2013553. \n                    http:\/\/aclweb.org\/anthology\/W\/W16\/W16-2346.pdf"},{"key":"9226_CR55","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, Montr\u00e9al, Canada, pp 3104\u20133112"},{"key":"9226_CR56","doi-asserted-by":"publisher","unstructured":"Szegedy C, Vanhoucke V, Ioffe S, Shlens J, Wojna Z (2016) Rethinking the inception architecture for computer vision. In: 2016 IEEE conference on computer vision and pattern recognition (CVPR), pp 2818\u20132826.\n                    https:\/\/doi.org\/10.1109\/CVPR.2016.308","DOI":"10.1109\/CVPR.2016.308"},{"key":"9226_CR57","doi-asserted-by":"crossref","unstructured":"Tu Z, Lu Z, Liu Y, Liu X, Li H (2016) Modeling coverage for neural machine translation. In: Proceedings of the 54th annual meeting of the Association for Computational Linguistics: long papers, Berlin, Germany, vol 1, pp 76\u201385. \n                    http:\/\/www.aclweb.org\/anthology\/P16-1008","DOI":"10.18653\/v1\/P16-1008"},{"key":"9226_CR58","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K (2015) Sequence to sequence\u2014video to text. In: Proceedings of the IEEE international conference on computer vision, Santiago, Chile, pp 4534\u20134542","DOI":"10.1109\/ICCV.2015.515"},{"key":"9226_CR59","unstructured":"Vilar D, Xu J, D\u2019Haro L, Ney H (2006) Error analysis of statistical machine translation output. In: Proceedings of the fifth international conference on language resources and evaluation (LREC-2006), Genoa, Italy"},{"key":"9226_CR60","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: a neural image caption generator. In: IEEE conference on computer vision and pattern recognition, CVPR 2015, Boston, Massachusetts, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"9226_CR61","doi-asserted-by":"crossref","unstructured":"Wu Q, Teney D, Wang P, Shen C, Dick A, van den Hengel A (2017) Visual question answering: a survey of methods and datasets. Comput Vis Image Underst 163:21\u201340. \n                    http:\/\/www.sciencedirect.com\/science\/article\/pii\/S1077314217300772\n                    \n                  , language in Vision","DOI":"10.1016\/j.cviu.2017.05.001"},{"key":"9226_CR62","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention. In: Proceedings of the 32nd international conference on machine learning (ICML-15), JMLR workshop and conference proceedings, Lille, France, pp 2048\u20132057. \n                    http:\/\/jmlr.org\/proceedings\/papers\/v37\/xuc15.pdf"},{"key":"9226_CR63","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J (2014) From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans Assoc Comput Linguist 2:67\u201378","journal-title":"Trans Assoc Comput Linguist"}],"container-title":["Machine Translation"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10590-019-09226-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10590-019-09226-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10590-019-09226-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,4,6]],"date-time":"2020-04-06T23:11:41Z","timestamp":1586214701000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10590-019-09226-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,4,8]]},"references-count":63,"journal-issue":{"issue":"1-2","published-print":{"date-parts":[[2019,6]]}},"alternative-id":["9226"],"URL":"https:\/\/doi.org\/10.1007\/s10590-019-09226-9","relation":{},"ISSN":["0922-6567","1573-0573"],"issn-type":[{"type":"print","value":"0922-6567"},{"type":"electronic","value":"1573-0573"}],"subject":[],"published":{"date-parts":[[2019,4,8]]},"assertion":[{"value":"15 July 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 January 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 April 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}