{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:31:26Z","timestamp":1779384686679,"version":"3.53.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,1,9]],"date-time":"2022-01-09T00:00:00Z","timestamp":1641686400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,1,9]],"date-time":"2022-01-09T00:00:00Z","timestamp":1641686400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1007\/s10994-021-06070-y","type":"journal-article","created":{"date-parts":[[2022,1,9]],"date-time":"2022-01-09T00:03:33Z","timestamp":1641686613000},"page":"917-935","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":37,"title":["A study of BERT for context-aware neural machine translation"],"prefix":"10.1007","volume":"111","author":[{"given":"Xueqing","family":"Wu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9823-9033","authenticated-orcid":false,"given":"Yingce","family":"Xia","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinhua","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lijun","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shufang","family":"Xie","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tao","family":"Qin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,1,9]]},"reference":[{"key":"6070_CR1","unstructured":"Adhikari, A., Ram, A., Tang, R., & Lin, J. (2019). Docbert: Bert for document classification. arXiv preprint arXiv:1904.08398"},{"key":"6070_CR2","unstructured":"Agrawal, R.\u00a0R., Turchi, M., & Negri, M. (2018). Contextual handling in neural machine translation: Look behind, ahead and on both sides. In 21st annual conference of the European association for machine translation (pp. 11\u201320)."},{"key":"6070_CR3","doi-asserted-by":"crossref","unstructured":"Bao, G., Zhang, Y., Teng, Z., Chen, B., & Luo, W. (2021). G-transformer for document-level machine translation. arXiv preprint arXiv:2105.14761","DOI":"10.18653\/v1\/2021.acl-long.267"},{"key":"6070_CR4","doi-asserted-by":"crossref","unstructured":"Bawden, R., Sennrich, R., Birch, A., & Haddow, B. (2018). Evaluating discourse phenomena in neural machine translation. In Proceedings of the 2018 conference of the North American chapter of the association for computational linguistics: Human language technologies, (long papers) (Vol.1, pp. 1304\u20131313).","DOI":"10.18653\/v1\/N18-1118"},{"key":"6070_CR5","unstructured":"Clark, K., Luong, M.-T., Le, Q.\u00a0V., & Manning, C.\u00a0D. (2020). Electra: Pre-training text encoders as discriminators rather than generators. In International conference on learning representations."},{"key":"6070_CR6","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In NAACL."},{"key":"6070_CR7","doi-asserted-by":"crossref","unstructured":"Edunov, S., Ott, M., Auli, M., Grangier, D., & Ranzato, M. (2018). Classical structured prediction losses for sequence to sequence learning. In Proceedings of the 2018 conference of the North American chapter of the association for computational linguistics: Human language technologies, (long papers) (Vol. 1, pp. 355\u2013364). Association for Computational Linguistics.","DOI":"10.18653\/v1\/N18-1033"},{"key":"6070_CR8","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., & Malik, J. (2014). Rich feature hierarchies for accurate object detection and semantic segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 580\u2013587).","DOI":"10.1109\/CVPR.2014.81"},{"key":"6070_CR9","unstructured":"Hassan, H., Aue, A., Chen, C., Chowdhary, V., Clark, J., Federmann, C., Huang, X., Junczys-Dowmunt, M., Lewis, W., Li, M., & Liu, S. (2018). Achieving human parity on automatic Chinese to English news translation. arXiv preprint arXiv:1803.05567"},{"key":"6070_CR10","unstructured":"Jean, S., Lauly, S., Firat, O., & Cho, K. (2017). Does neural machine translation benefit from larger context? arXiv preprint arXiv:1704.05135"},{"key":"6070_CR11","doi-asserted-by":"crossref","unstructured":"Junczys-Dowmunt, M. (2019). Microsoft translator at WMT 2019: Towards large-scale document-level neural machine translation. In Proceedings of the fourth conference on machine translation (shared task papers, Day 1) (Vol. 2, pp. 225\u2013233). Association for Computational Linguistics.","DOI":"10.18653\/v1\/W19-5321"},{"key":"6070_CR12","doi-asserted-by":"crossref","unstructured":"Kang, X., Zhao, Y., Zhang, J., & Zong, C. (2020). Dynamic context selection for document-level neural machine translation via reinforcement learning. In Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP) (pp. 2242\u20132254).","DOI":"10.18653\/v1\/2020.emnlp-main.175"},{"key":"6070_CR13","doi-asserted-by":"crossref","unstructured":"Kim, Y., Tran, D.\u00a0T., & Ney, H. (2019). When and why is document-level context useful in neural machine translation? In Proceedings of the fourth workshop on discourse in machine translation (DiscoMT 2019) (pp. 24\u201334).","DOI":"10.18653\/v1\/D19-6503"},{"key":"6070_CR14","unstructured":"Kingma, D. P., & Ba, J. (2015). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"key":"6070_CR15","unstructured":"Koehn, P. (2004). Statistical significance tests for machine translation evaluation. In Proceedings of the 2004 conference on empirical methods in natural language processing (pp. 388\u2013395). Association for Computational Linguistics."},{"key":"6070_CR16","doi-asserted-by":"crossref","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., & Zettlemoyer, L. (2020). Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In Proceedings of the 58th annual meeting of the association for computational linguistics (pp. 7871\u20137880).","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"6070_CR17","unstructured":"Li, L., Jiang, X., & Liu, Q. (2019). Pretrained language models for document-level neural machine translation. arXiv preprint arXiv:1911.03110"},{"key":"6070_CR18","doi-asserted-by":"crossref","unstructured":"Li, B., Liu, H., Wang, Z., Jiang, Y., Xiao, T., Zhu, J., Liu, T., & Li, C. (2020). Does multi-encoder help? A case study on context-aware neural machine translation. arXiv preprint arXiv:2005.03393","DOI":"10.18653\/v1\/2020.acl-main.322"},{"key":"6070_CR19","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692"},{"key":"6070_CR20","doi-asserted-by":"publisher","first-page":"726","DOI":"10.1162\/tacl_a_00343","volume":"8","author":"Y Liu","year":"2020","unstructured":"Liu, Y., Gu, J., Goyal, N., Li, X., Edunov, S., Ghazvininejad, M., Lewis, M., & Zettlemoyer, L. (2020). Multilingual denoising pre-training for neural machine translation. Transactions of the Association for Computational Linguistics, 8, 726\u2013742.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"6070_CR21","doi-asserted-by":"crossref","unstructured":"Ma, S., Zhang, D., & Zhou, M. (2020). A simple and effective unified encoder for document-level machine translation. In Proceedings of the 58th annual meeting of the association for computational linguistics (pp. 3505\u20133511).","DOI":"10.18653\/v1\/2020.acl-main.321"},{"key":"6070_CR22","doi-asserted-by":"crossref","unstructured":"Maruf, S., Martins, A. F.\u00a0T., & Haffari, G. (2019a). Selective attention for context-aware neural machine translation. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies (long and short papers) (Vol. 1, pp. 3092\u20133102). Association for Computational Linguistics.","DOI":"10.18653\/v1\/N19-1313"},{"key":"6070_CR23","unstructured":"Maruf, S., Saleh, F., & Haffari, G. (2019b). A survey on document-level machine translation: Methods and evaluation. arXiv preprint arXiv:1912.08494"},{"key":"6070_CR24","doi-asserted-by":"crossref","unstructured":"Miculicich, L., Ram, D., Pappas, N., & Henderson, J. (2018). Document-level neural machine translation with hierarchical attention networks. In Proceedings of the 2018 conference on empirical methods in natural language processing (pp. 2947\u20132954). Association for Computational Linguistics.","DOI":"10.18653\/v1\/D18-1325"},{"key":"6070_CR25","doi-asserted-by":"crossref","unstructured":"Morishita, M., Suzuki, J., Iwata, T., & Nagata, M. (2021). Context-aware neural machine translation with mini-batch embedding. In Proceedings of the 16th conference of the European chapter of the association for computational linguistics: main volume (pp. 2513\u20132521). Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.eacl-main.214"},{"key":"6070_CR26","doi-asserted-by":"crossref","unstructured":"M\u00fcller, M., Gonzales, A.\u00a0R., Voita, E., & Sennrich, R. (2018). A large-scale test set for the evaluation of context-aware pronoun translation in neural machine translation. In Proceedings of the third conference on machine translation: research papers (pp. 61\u201372).","DOI":"10.18653\/v1\/W18-6307"},{"key":"6070_CR27","doi-asserted-by":"crossref","unstructured":"Ng, N., Yee, K., Baevski, A., Ott, M., Auli, M., & Edunov, S. (2019). Facebook FAIR\u2019s WMT19 news translation task submission. In Proceedings of the fourth conference on machine translation (WMT19) (pp. 314\u2013319). Association for Computational Linguistics.","DOI":"10.18653\/v1\/W19-5333"},{"key":"6070_CR28","doi-asserted-by":"crossref","unstructured":"Peters, M., Neumann, M., Iyyer, M., Gardner, M., Clark, C., Lee, K., & Zettlemoyer, L. (2018). Deep contextualized word representations. In Proceedings of the 2018 conference of the North American chapter of the association for computational linguistics: human language technologies (long papers) (Vol. 1, pp. 2227\u20132237). Association for Computational Linguistics.","DOI":"10.18653\/v1\/N18-1202"},{"key":"6070_CR29","unstructured":"Radford, A., Narasimhan, K., Salimans, T., & Sutskever, I. (2018). Improving language understanding by generative pre-training. https:\/\/s3-us-west-2.amazonaws.com\/openai-assets\/researchcovers\/languageunsupervised\/languageunderstandingpaper.pdf"},{"issue":"8","key":"6070_CR30","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., & Sutskever, I. (2019). Language models are unsupervised multitask learners. OpenAI Blog, 1(8), 9.","journal-title":"OpenAI Blog"},{"key":"6070_CR31","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., & Birch, A. (2016). Neural machine translation of rare words with subword units. In Proceedings of the 54th annual meeting of the association for computational linguistics (long papers) (Vol. 1, pp. 1715\u20131725). Association for Computational Linguistics.","DOI":"10.18653\/v1\/P16-1162"},{"key":"6070_CR32","unstructured":"Sermanet, P., Eigen, D., Zhang, X., Mathieu, M., Fergus, R., & LeCun, Y. (2013). Overfeat: Integrated recognition, localization and detection using convolutional networks. arXiv preprint arXiv:1312.6229"},{"key":"6070_CR33","doi-asserted-by":"crossref","unstructured":"Tiedemann, J., & Scherrer, Y. (2017). Neural machine translation with extended context. arXiv preprint arXiv:1708.05943","DOI":"10.18653\/v1\/W17-4811"},{"key":"6070_CR34","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141, & Polosukhin, I. (2017). Attention is all you need. In Advances in neural information processing systems (pp. 5998\u20136008)."},{"key":"6070_CR35","doi-asserted-by":"crossref","unstructured":"Voita, E., Sennrich, R., & Titov, I. (2019). When a good translation is wrong in context: Context-aware machine translation improves on deixis, ellipsis, and lexical cohesion. In Proceedings of the 57th annual meeting of the association for computational linguistics (pp. 1198\u20131212). Association for Computational Linguistics.","DOI":"10.18653\/v1\/P19-1116"},{"key":"6070_CR36","doi-asserted-by":"crossref","unstructured":"Voita, E., Serdyukov, P., Sennrich, R., & Titov, I. (2018). Context-aware neural machine translation learns anaphora resolution. In Proceedings of the 56th annual meeting of the association for computational linguistics (long papers) (Vol. 1, pp. 1264\u20131274). Association for Computational Linguistics.","DOI":"10.18653\/v1\/P18-1117"},{"key":"6070_CR37","doi-asserted-by":"crossref","unstructured":"Wong, K., Maruf, S., & Haffari, G. (2020). Contextual neural machine translation improves translation of cataphoric pronouns. In Proceedings of the 58th annual meeting of the association for computational linguistics (pp. 5971\u20135978).","DOI":"10.18653\/v1\/2020.acl-main.530"},{"key":"6070_CR38","doi-asserted-by":"crossref","unstructured":"Xiong, H., He, Z., Wu, H., & Wang, H. (2019a). Modeling coherence for discourse neural machine translation. In Proceedings of the AAAI conference on artificial intelligence (Vol. 33, pp. 7338\u20137345).","DOI":"10.1609\/aaai.v33i01.33017338"},{"key":"6070_CR39","unstructured":"Xiong, H., Zhang, R., Zhang, C., He, Z., Wu, H., & Wang, H. (2019b). Dutongchuan: Context-aware translation model for simultaneous interpreting. arXiv preprint arXiv:1907.12984"},{"key":"6070_CR40","unstructured":"Yang, J., Wang, M., Zhou, H., Zhao, C., Yu, Y., Zhang, W., & Li, L. (2019a). Towards making the most of bert in neural machine translation. arXiv preprint arXiv:1908.05672"},{"key":"6070_CR41","doi-asserted-by":"crossref","unstructured":"Yang, W., Xie, Y., Lin, A., Li, X., Tan, L., Xiong, K., Li, M., & Lin, J. (2019b). End-to-end open-domain question answering with bertserini. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics (demonstrations) (pp. 72\u201377).","DOI":"10.18653\/v1\/N19-4013"},{"key":"6070_CR42","doi-asserted-by":"crossref","unstructured":"Yang, Z., Yang, D., Dyer, C., He, X., Smola, A., & Hovy, E. (2016). Hierarchical attention networks for document classification. In Proceedings of the 2016 conference of the North American chapter of the association for computational linguistics: human language technologies (pp. 1480\u20131489).","DOI":"10.18653\/v1\/N16-1174"},{"key":"6070_CR43","doi-asserted-by":"crossref","unstructured":"Yun, H., Hwang, Y., & Jung, K. (2020). Improving context-aware neural machine translation using self-attentive sentence embedding. In Proceedings of the AAAI conference on artificial intelligence (Vol. 34, pp. 9498\u20139506).","DOI":"10.1609\/aaai.v34i05.6494"},{"key":"6070_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, J., Luan, H., Sun, M., Zhai, F., Xu, J., Zhang, M., & Liu, Y. (2018a). Improving the transformer translation model with document-level context. In Proceedings of the 2018 conference on empirical methods in natural language processing (pp. 533\u2013542). Association for Computational Linguistics.","DOI":"10.18653\/v1\/D18-1049"},{"key":"6070_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, J., Luan, H., Sun, M., Zhai, F., Xu, J., Zhang, M., & Liu, Y. (2018b). Improving the transformer translation model with document-level context. In Proceedings of the 2018 conference on empirical methods in natural language processing (pp. 533\u2013542).","DOI":"10.18653\/v1\/D18-1049"},{"key":"6070_CR46","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Yue, X., Huang, S., Chen, J., & Birch, A. (2020). Toward making the most of context in neural machine translation. In IJCAI-PRICAI.","DOI":"10.24963\/ijcai.2020\/551"},{"key":"6070_CR47","unstructured":"Zhu, J., Xia, Y., Wu, L., He, D., Qin, T., Zhou, W., Li, H., & Liu, T. (2020). Incorporating bert into neural machine translation. In International conference on learning representations."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06070-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-021-06070-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06070-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,15]],"date-time":"2024-09-15T22:33:10Z","timestamp":1726439590000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-021-06070-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,9]]},"references-count":47,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,3]]}},"alternative-id":["6070"],"URL":"https:\/\/doi.org\/10.1007\/s10994-021-06070-y","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,9]]},"assertion":[{"value":"24 May 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 August 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 September 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 January 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to participate"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}