{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:26:46Z","timestamp":1750220806258,"version":"3.41.0"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"1-2","license":[{"start":{"date-parts":[[2017,4,19]],"date-time":"2017-04-19T00:00:00Z","timestamp":1492560000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Translation"],"published-print":{"date-parts":[[2017,6]]},"DOI":"10.1007\/s10590-017-9195-1","type":"journal-article","created":{"date-parts":[[2017,4,19]],"date-time":"2017-04-19T16:56:10Z","timestamp":1492620970000},"page":"19-33","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A comparison of discriminative training criteria for continuous space translation models"],"prefix":"10.1007","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8627-1965","authenticated-orcid":false,"given":"Alexandre","family":"Allauzen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quoc Khanh","family":"Do","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fran\u00e7ois","family":"Yvon","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,4,19]]},"reference":[{"key":"9195_CR1","unstructured":"Allauzen A, P\u00e9cheux N, Do QK, Dinarelli M, Lavergne T, Max A, Le H, Yvon F (2013) LIMSI @ WMT13. In: Proceedings of the workshop on statistical machine translation, Sofia, Bulgaria, pp 62\u201369"},{"key":"9195_CR2","doi-asserted-by":"crossref","unstructured":"Auli M, Gao J (2014a) Decoder integration and expected BLEU training for recurrent neural network language models. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), pp 136\u2013142","DOI":"10.3115\/v1\/P14-2023"},{"key":"9195_CR3","doi-asserted-by":"crossref","unstructured":"Auli M, Gao J (2014b) Decoder integration and expected BLEU training for recurrent neural network language models. In: Proceedings of the 52nd annual meeting of the Association for Computational Linguistics (ACL\u201914), pp 136\u2013142","DOI":"10.3115\/v1\/P14-2023"},{"key":"9195_CR4","doi-asserted-by":"crossref","unstructured":"Auli M, Galley M, Quirk C, Zweig G (2013) Joint language and translation modeling with recurrent neural networks. In: Proceedings of the conference on empirical methods in natural language processing (EMNLP), pp 1044\u20131054","DOI":"10.18653\/v1\/D13-1106"},{"key":"9195_CR5","unstructured":"Bahdanau D, Cho K, Bengio Y (2014) Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473"},{"key":"9195_CR6","first-page":"1137","volume":"3","author":"Y Bengio","year":"2003","unstructured":"Bengio Y, Ducharme R, Vincent P, Janvin C (2003) A neural probabilistic language model. J Mach Learn Res 3:1137\u20131155","journal-title":"J Mach Learn Res"},{"key":"9195_CR7","doi-asserted-by":"crossref","unstructured":"Blunsom P, Osborne M (2008) Probabilistic inference for machine translation. In: Proceedings of the conference on empirical methods in natural language processing, pp 215\u2013223","DOI":"10.3115\/1613715.1613746"},{"key":"9195_CR8","unstructured":"Blunsom P, Cohn T, Osborne M (2008) A discriminative latent variable model for statistical machine translation. In: ACL, pp 200\u2013208"},{"issue":"3","key":"9195_CR9","doi-asserted-by":"crossref","first-page":"205","DOI":"10.1162\/089120104323093294","volume":"30","author":"F Casacuberta","year":"2004","unstructured":"Casacuberta F, Vidal E (2004) Machine translation with inferred stochastic finite-state transducers. Comput Linguist 30(3):205\u2013225","journal-title":"Comput Linguist"},{"key":"9195_CR10","unstructured":"Cherry C, Foster G (2012) Batch tuning strategies for statistical machine translation. In: Proceedings of the North American chapter of the Association for Computational Linguistics: human language technologies (NAACL-HLT), pp 427\u2013436"},{"key":"9195_CR11","doi-asserted-by":"crossref","unstructured":"Chiang D, Marton Y, Resnik P (2008) Online large-margin training of syntactic and structural translation features. In: Proceedings of the conference on empirical methods in natural language processing, pp 224\u2013233","DOI":"10.3115\/1613715.1613747"},{"key":"9195_CR12","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), Doha, Qatar, pp 1724\u20131734","DOI":"10.3115\/v1\/D14-1179"},{"key":"9195_CR13","doi-asserted-by":"crossref","unstructured":"Collins M (2002) Discriminative training methods for hidden Markov models: theory and experiments with perceptron algorithms. In: Proceedings of the conference on empirical methods in natural language processing (EMNLP), pp 1\u20138","DOI":"10.3115\/1118693.1118694"},{"key":"9195_CR14","doi-asserted-by":"crossref","unstructured":"Collobert R, Weston J (2008) A unified architecture for natural language processing: deep neural networks with multitask learning. In: Proceedings of the 25th international conference on machine learning. ACM, New York, pp 160\u2013167","DOI":"10.1145\/1390156.1390177"},{"key":"9195_CR15","first-page":"2493","volume":"12","author":"R Collobert","year":"2011","unstructured":"Collobert R, Weston J, Bottou L, Karlen M, Kavukcuoglu K, Kuksa P (2011) Natural language processing (almost) from scratch. J Mach Learn Res 12:2493\u20132537","journal-title":"J Mach Learn Res"},{"key":"9195_CR16","first-page":"951","volume":"3","author":"K Crammer","year":"2003","unstructured":"Crammer K, Singer Y (2003) Ultraconservative online algorithms for multiclass problems. J Mach Learn Res 3:951\u2013991","journal-title":"J Mach Learn Res"},{"issue":"3","key":"9195_CR17","doi-asserted-by":"crossref","first-page":"199","DOI":"10.1007\/s10590-007-9024-z","volume":"20","author":"JM Crego","year":"2006","unstructured":"Crego JM, Mari\u00f1o JB (2006) Improving statistical MT by coupling reordering and decoding. Mach Transl 20(3):199\u2013215","journal-title":"Mach Transl"},{"key":"9195_CR18","doi-asserted-by":"crossref","first-page":"49","DOI":"10.2478\/v10108-011-0010-5","volume":"96","author":"JM Crego","year":"2011","unstructured":"Crego JM, Yvon F, Mari\u00f1o JB (2011) N-code: an open-source bilingual N-gram SMT toolkit. Prague Bull Math Linguist 96:49\u201358","journal-title":"Prague Bull Math Linguist"},{"key":"9195_CR19","doi-asserted-by":"crossref","unstructured":"Devlin J, Zbib R, Huang Z, Lamar T, Schwartz R, Makhoul J (2014) Fast and robust neural network joint models for statistical machine translation. In: Proceedings of the 52nd annual meeting of the Association for Computational Linguistics. Long papers, vol 1, Baltimore, MD, pp 1370\u20131380","DOI":"10.3115\/v1\/P14-1129"},{"key":"9195_CR20","unstructured":"Do QK (2016) Discriminative training of continuous space translation models. PhD Thesis, Universit\u00e9 Paris-Sud and Universit\u00e9 Paris-Saclay"},{"key":"9195_CR21","doi-asserted-by":"crossref","unstructured":"Do Q-K, Allauzen A, Yvon F (2014) Discriminative adaptation of continuous space translation models. In: International workshop on spoken language translation (IWSLT 2014), Lake Tahoe, USA","DOI":"10.18653\/v1\/D15-1121"},{"key":"9195_CR22","unstructured":"Do Q-K, Allauzen A, Yvon F (2015a) Apprentissage discriminant des mod\u00e8les continus de traduction. In: Actes de la 22e conf\u00e9rence sur le Traitement Automatique des Langues Naturelles, Caen, France. Association pour le Traitement Automatique des Langues, pp 267\u2013278"},{"key":"9195_CR23","doi-asserted-by":"crossref","unstructured":"Do QK, Allauzen A, Yvon F (2015b) A discriminative training procedure for continuous translation models. In: Conference on empirical methods in natural language processing (EMNLP 2015), Lisboa, Portugal, p 7","DOI":"10.18653\/v1\/D15-1121"},{"key":"9195_CR24","unstructured":"Dyer C, Resnik P (2010) Context-free reordering, finite-state translation. In: Proceedings of the North American chapter of the Association for Computational Linguistics: human language technologies (NAACL-HLT), pp 858\u2013866"},{"issue":"3","key":"9195_CR25","doi-asserted-by":"crossref","first-page":"277","DOI":"10.1023\/A:1007662407062","volume":"37","author":"Y Freund","year":"1999","unstructured":"Freund Y, Schapire RE (1999) Large margin classification using the perceptron algorithm. Mach Learn 37(3):277\u2013296","journal-title":"Mach Learn"},{"key":"9195_CR26","unstructured":"Gao J, He X (2013) Training MRF-based phrase translation models using gradient ascent. In: Proceedings of the North American chapter of the Association for Computational Linguistics: human language technologies (NAACL-HLT), Atlanta, pp 450\u2013459"},{"key":"9195_CR27","doi-asserted-by":"crossref","unstructured":"Gao J, He X, Yih W-t, Deng L (2014) Learning continuous phrase representations for translation modeling. In: Proceedings of the 52nd annual meeting of the Association for Computational Linguistics, Baltimore, MD","DOI":"10.3115\/v1\/P14-1066"},{"key":"9195_CR28","unstructured":"Gutmann M, Hyv\u00e4rinen A (2010) Noise-contrastive estimation: a new estimation principle for unnormalized statistical models. In: Teh YW, Titterington M (eds) Proceedings of the international conference on artificial intelligence and statistics (AISTATS), vol 9, pp 297\u2013304"},{"key":"9195_CR29","unstructured":"He X, Deng L (2012) Maximum expected BLEU training of phrase and lexicon translation models. In: Proceedings of the 50th annual meeting of the Association for Computational Linguistics: long papers, vol 1, pp 292\u2013301"},{"key":"9195_CR30","unstructured":"Hopkins M, May J (2011) Tuning as ranking. In: Proceedings of the 2011 conference on empirical methods in natural language processing, Edinburgh, Scotland, UK, pp 1352\u20131362"},{"key":"9195_CR31","unstructured":"Lavergne T, Crego JM, Allauzen A, Yvon F (2011) From n-gram-based to CRF-based translation models. In: Proceedings of the sixth workshop on statistical machine translation, pp 542\u2013553"},{"key":"9195_CR32","unstructured":"Lavergne T, Allauzen A, Yvon F (2013) Un cadre d\u2019apprentissage int\u00e9gralement discriminant pour la traduction statistique. TALN-R\u00c9CITAL 2013, p 450"},{"key":"9195_CR33","unstructured":"Le H-S, Oparin I, Allauzen A, Gauvain J-L, Yvon F (2011) Structured output layer neural network language model. In: Proceedings of the international conference on audio, speech and signal processing, pp 5524\u20135527"},{"key":"9195_CR34","unstructured":"Le H-S, Allauzen A, Yvon F (2012) Continuous space translation models with neural networks. In: Proceedings of the North American chapter of the Association for Computational Linguistics: human language technologies (NAACL-HLT), Montr\u00e9al, Canada, pp 39\u201348"},{"key":"9195_CR35","doi-asserted-by":"crossref","unstructured":"Liang P, Bouchard-C\u00f4t\u00e9 A, Klein D, Taskar B (2006) An end-to-end discriminative approach to machine translation. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), pp 761\u2013768","DOI":"10.3115\/1220175.1220271"},{"issue":"4","key":"9195_CR36","doi-asserted-by":"crossref","first-page":"527","DOI":"10.1162\/coli.2006.32.4.527","volume":"32","author":"JB Mari\u00f1o","year":"2006","unstructured":"Mari\u00f1o JB, Banchs RE, Crego JM, de Gispert A, Lambert P, Fonollosa JA, Costa-Juss\u00e0 MR (2006) N-gram-based machine translation. Comput Linguist 32(4):527\u2013549","journal-title":"Comput Linguist"},{"key":"9195_CR37","doi-asserted-by":"crossref","unstructured":"McDonald R, Crammer K, Pereira F (2005) Online large-margin training of dependency parsers. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), pp 91\u201398","DOI":"10.3115\/1219840.1219852"},{"key":"9195_CR38","unstructured":"Mnih A, Hinton GE (2008) A scalable hierarchical distributed language model. In: Koller D, Schuurmans D, Bengio Y, Bottou L (eds) Advances in neural information processing Systems 21, vol 21, pp 1081\u20131088"},{"key":"9195_CR39","unstructured":"Mnih A, Teh YW (2012) A fast and simple algorithm for training neural probabilistic language models. In: Proceedings of the international conference of machine learning (ICML)"},{"issue":"1","key":"9195_CR40","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/COLI_a_00241","volume":"42","author":"G Neubig","year":"2016","unstructured":"Neubig G, Watanabe T (2016) Optimization for statistical machine translation: a survey. Comput Linguist 42(1):1\u201354","journal-title":"Comput Linguist"},{"key":"9195_CR41","unstructured":"Niehues J, Waibel A (2012) Continuous space language models using restricted Boltzmann machines. In: Proceedings of international workshop on spoken language translation (IWSLT), Hong-Kong, China, pp 164\u2013170"},{"key":"9195_CR42","doi-asserted-by":"crossref","unstructured":"Och FJ (2003) Minimum error rate training in statistical machine translation. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL). Association for Computational Linguistics, pp 160\u2013167","DOI":"10.3115\/1075096.1075117"},{"key":"9195_CR43","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), pp 311\u2013318"},{"key":"9195_CR44","unstructured":"Rosti A-V, Zhang B, Matsoukas S, Schwartz R (2010) BBN system description for WMT10 system combination task. In: Proceedings of the joint fifth workshop on statistical machine translation and MetricsMATR, Uppsala, Sweden. Association for Computational Linguistics, pp 321\u2013326"},{"issue":"3","key":"9195_CR45","doi-asserted-by":"crossref","first-page":"492","DOI":"10.1016\/j.csl.2006.09.003","volume":"21","author":"H Schwenk","year":"2007","unstructured":"Schwenk H (2007) Continuous space language models. Comput Speech Lang 21(3):492\u2013518","journal-title":"Comput Speech Lang"},{"key":"9195_CR46","unstructured":"Schwenk H, Costa-Jussa MR, Fonollosa JAR (2007) Smooth bilingual $$n$$ n -gram translation. In: Proceedings of the conference on empirical methods in natural language processing (EMNLP), Prague, Czech Republic, pp 430\u2013438"},{"issue":"1\u20133","key":"9195_CR47","doi-asserted-by":"crossref","first-page":"73","DOI":"10.1007\/s10994-005-0918-9","volume":"60","author":"L Shen","year":"2005","unstructured":"Shen L, Joshi AK (2005) Ranking and reranking with perceptron. Mach Learn 60(1\u20133):73\u201396","journal-title":"Mach Learn"},{"key":"9195_CR48","unstructured":"Shen L, Sarkar A, Och FJ (2004) Discriminative reranking for machine translation. In: HLT-NAACL, pp 177\u2013184"},{"key":"9195_CR49","unstructured":"Shen S, Cheng Y, He Z, He W, Wu H, Sun M, Liu Y (2015) Minimum risk training for neural machine translation. CoRR. arXiv:1512.02433"},{"key":"9195_CR50","unstructured":"Simianer P, Riezler S, Dyer C (2012) Joint feature selection in distributed stochastic learning for large-scale discriminative training in SMT. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), pp 11\u201321"},{"key":"9195_CR51","unstructured":"Socher R, Bauer J, Manning CD, Andrew YN (2013) Parsing with compositional vector grammars. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), Sofia, Bulgaria, pp 455\u2013465"},{"key":"9195_CR52","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, NIPS*27, Montr\u00e9al, Canada, pp 3104\u20133112"},{"key":"9195_CR53","doi-asserted-by":"crossref","unstructured":"Vaswani A, Zhao Y, Fossum V, Chiang D (2013) Decoding with large-scale neural language models improves translation. In: Proceedings of the conference on empirical methods in natural language Processing (EMNLP), Seattle, Washington, USA, pp 1387\u20131392","DOI":"10.18653\/v1\/D13-1140"},{"key":"9195_CR54","unstructured":"Watanabe T, Suzuki J, Tsukada H, Isozaki H (2007) Online large-margin training for statistical machine translation. In: Proceedings of the 2007 joint conference on empirical methods in natural language processing and computational natural language learning (EMNLP-CoNLL), Prague, Czech Republic, pp 764\u2013773"},{"key":"9195_CR55","unstructured":"Yang N, Liu S, Li M, Zhou M, Yu N (2013) Word alignment modeling with context dependent deep neural networks. In: Proceedings of the annual meeting of the Association for Computational Linguistics (ACL), Sofia, Bulgaria, pp 166\u2013175"},{"key":"9195_CR56","doi-asserted-by":"crossref","unstructured":"Zens R, Och FJ, Ney H (2002) Phrase-based statistical machine translation. In: KI \u201902: proceedings of the 25th annual German conference on AI. Springer, London, pp 18\u201332","DOI":"10.1007\/3-540-45751-8_2"},{"key":"9195_CR57","unstructured":"Zens R, Hasan S, Ney H (2007) A systematic comparison of training criteria for statistical machine translation. In: Proceedings of the 2007 joint conference on empirical methods in natural language processing and computational natural language learning (EMNLP-CoNLL), Prague, Czech Republic, pp 524\u2013532"}],"container-title":["Machine Translation"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10590-017-9195-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10590-017-9195-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10590-017-9195-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:12:07Z","timestamp":1750201927000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10590-017-9195-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,4,19]]},"references-count":57,"journal-issue":{"issue":"1-2","published-print":{"date-parts":[[2017,6]]}},"alternative-id":["9195"],"URL":"https:\/\/doi.org\/10.1007\/s10590-017-9195-1","relation":{},"ISSN":["0922-6567","1573-0573"],"issn-type":[{"type":"print","value":"0922-6567"},{"type":"electronic","value":"1573-0573"}],"subject":[],"published":{"date-parts":[[2017,4,19]]}}}