{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T07:23:18Z","timestamp":1760080998057},"reference-count":35,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"10","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2024,10,1]]},"DOI":"10.1587\/transinf.2023edp7249","type":"journal-article","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T22:12:59Z","timestamp":1727734379000},"page":"1322-1331","source":"Crossref","is-referenced-by-count":1,"title":["Neural End-To-End Speech Translation Leveraged by ASR Posterior Distribution"],"prefix":"10.1587","volume":"E107.D","author":[{"given":"Yuka","family":"KO","sequence":"first","affiliation":[{"name":"Nara Institute of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Katsuhito","family":"SUDOH","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sakriani","family":"SAKTI","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satoshi","family":"NAKAMURA","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"},{"name":"School of Data Science, Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] H. Ney, \u201cSpeech translation: coupling of recognition and translation,\u201d Proc. 1999 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP &apos;99, Phoenix, Arizona, USA, March 15-19, 1999, pp.517-520, IEEE Computer Society, 1999. 10.1109\/icassp.1999.758176","DOI":"10.1109\/ICASSP.1999.758176"},{"key":"2","doi-asserted-by":"publisher","unstructured":"[2] F. Casacuberta, M. Federico, H. Ney, and E. Vidal, \u201cRecent efforts in spoken language translation,\u201d IEEE Signal Process. Mag., vol.25, no.3, pp.80-88, 2008. 10.1109\/msp.2008.917989","DOI":"10.1109\/MSP.2008.917989"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] G. Kumar, M. Post, D. Povey, and S. Khudanpur, \u201cSome insights from translating conversational telephone speech,\u201d IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2014, Florence, Italy, May 4-9, 2014, pp.3231-3235, IEEE, 2014. 10.1109\/icassp.2014.6854197","DOI":"10.1109\/ICASSP.2014.6854197"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] M. Sperber, G. Neubig, N.-Q. Pham, and A. Waibel, \u201cSelf-Attentional Models for Lattice Inputs,\u201d Proc. 57th Conference of the Association for Computational Linguistics, ACL 2019, Florence, Italy, July 28- Aug. 2, 2019, Volume 1: Long Papers, ed. A. Korhonen, D.R. Traum, and L. M\u00e0rquez, pp.1185-1197, Association for Computational Linguistics, 2019. 10.18653\/v1\/p19-1115","DOI":"10.18653\/v1\/P19-1115"},{"key":"5","unstructured":"[5] A. Berard, O. Pietquin, C. Servan, and L. Besacier, \u201cListen and translate: A proof of concept for end-to-end speech-to-text translation,\u201d CoRR, vol.abs\/1612.01744, 2016."},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] R.J. Weiss, J. Chorowski, N. Jaitly, Y. Wu, and Z. Chen, \u201cSequence-to-sequence models can directly translate foreign speech,\u201d Interspeech 2017, 18th Annual Conference of the International Speech Communication Association, Stockholm, Sweden, Aug. 20-24, 2017, ed. F. Lacerda, pp.2625-2629, ISCA, 2017. 10.21437\/interspeech.2017-503","DOI":"10.21437\/Interspeech.2017-503"},{"key":"7","unstructured":"[7] J. Niehues, R. Cattoni, S. St\u00fcker, M. Negri, M. Turchi, T. Ha, E. Salesky, R. Sanabria, L. Barrault, L. Specia, and M. Federico, \u201cThe IWSLT 2019 evaluation campaign,\u201d Proc. 16th International Conference on Spoken Language Translation, IWSLT 2019, Hong Kong, Nov. 2-3, 2019, ed. J. Niehues, R. Cattoni, S. St\u00fcker, M. Negri, M. Turchi, T. Ha, E. Salesky, R. Sanabria, L. Barrault, L. Specia, and M. Federico, Association for Computational Linguistics, 2019."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] A. Anastasopoulos and D. Chiang, \u201cTied Multitask Learning for Neural Speech Translation,\u201d Proc. 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2018, New Orleans, Louisiana, USA, June 1-6, 2018, Volume 1 (Long Papers), ed. M.A. Walker, H. Ji, and A. Stent, pp.82-91, Association for Computational Linguistics, 2018. 10.18653\/v1\/n18-1008","DOI":"10.18653\/v1\/N18-1008"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] Y. Jia, R.J. Weiss, F. Biadsy, W. Macherey, M. Johnson, Z. Chen, and Y. Wu, \u201cDirect Speech-to-Speech Translation with a Sequence-to-Sequence Model,\u201d Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15-19 Sept. 2019, ed. G. Kubin and Z. Kacic, pp.1123-1127, ISCA, 2019. 10.21437\/interspeech.2019-1951","DOI":"10.21437\/Interspeech.2019-1951"},{"key":"10","doi-asserted-by":"publisher","unstructured":"[10] T. Kano, S. Sakti, and S. Nakamura, \u201cEnd-to-End Speech Translation With Transcoding by Multi-Task Learning for Distant Language Pairs,\u201d IEEE ACM Trans. Audio Speech Lang. Process., vol.28, pp.1342-1355, 2020. 10.1109\/taslp.2020.2986886","DOI":"10.1109\/TASLP.2020.2986886"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] T. Kano, S. Sakti, and S. Nakamura, \u201cTransformer-Based Direct Speech-To-Speech Translation with Transcoder,\u201d 2021 IEEE Spoken Language Technology Workshop (SLT), pp.958-965, 2021. 10.1109\/slt48900.2021.9383496","DOI":"10.1109\/SLT48900.2021.9383496"},{"key":"12","unstructured":"[12] K. Osamura, T. Kano, S. Sakti, K. Sudoh, and S. Nakamura, \u201cUsing Spoken Word Posterior Features in Neural Machine Translation,\u201d Proc. 15th International Workshop on Spoken Language Translation, 181-188, Oct. 2018, vol.21, p.22, 2018."},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] P. Bahar, T. Bieschke, R. Schl\u00fcter, and H. Ney, \u201cTight integrated end-to-end training for cascaded speech translation,\u201d IEEE Spoken Language Technology Workshop, SLT 2021, Shenzhen, China, Jan. 19-22, 2021, pp.950-957, IEEE, 2021. 10.1109\/slt48900.2021.9383462","DOI":"10.1109\/SLT48900.2021.9383462"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] S. Dalmia, B. Yan, V. Raunak, F. Metze, and S. Watanabe, \u201cSearchable hidden intermediates for end-to-end models of decomposable sequence tasks,\u201d Proc. 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2021, Online, June 6-11, 2021, ed. K. Toutanova, A. Rumshisky, L. Zettlemoyer, D. Hakkani-T\u00fcr, I. Beltagy, S. Bethard, R. Cotterell, T. Chakraborty, and Y. Zhou, pp.1882-1896, Association for Computational Linguistics, 2021. 10.18653\/v1\/2021.naacl-main.151","DOI":"10.18653\/v1\/2021.naacl-main.151"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] S.-P. Chuang, T.-W. Sung, A.H. Liu, and H.-Y. Lee, \u201cWorse WER, but Better BLEU? Leveraging Word Embedding as Intermediate in Multitask End-to-End Speech Translation,\u201d Proc. 58th Annual Meeting of the Association for Computational Linguistics, ACL 2020, Online, July 5-10, 2020, ed. D. Jurafsky, J. Chai, N. Schluter, and J.R. Tetreault, pp.5998-6003, Association for Computational Linguistics, 2020. 10.18653\/v1\/2020.acl-main.533","DOI":"10.18653\/v1\/2020.acl-main.533"},{"key":"16","unstructured":"[16] G.E. Hinton, O. Vinyals, and J. Dean, \u201cDistilling the Knowledge in a Neural Network,\u201d CoRR, vol.abs\/1503.02531, 2015."},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] Y. Kim and A.M. Rush, \u201cSequence-level knowledge distillation,\u201d Proc. 2016 Conference on Empirical Methods in Natural Language Processing, EMNLP 2016, Austin, Texas, USA, Nov. 1-4, 2016, ed. J. Su, X. Carreras, and K. Duh, pp.1317-1327, The Association for Computational Linguistics, 2016. 10.18653\/v1\/d16-1139","DOI":"10.18653\/v1\/D16-1139"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] Y. Liu, H. Xiong, J. Zhang, Z. He, H. Wu, H. Wang, and C. Zong, \u201cEnd-to-End Speech Translation with Knowledge Distillation,\u201d Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15-19 Sept. 2019, ed. G. Kubin and Z. Kacic, pp.1128-1132, ISCA, 2019.","DOI":"10.21437\/Interspeech.2019-2582"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] M. Gaido, M.A.D. Gangi, M. Negri, and M. Turchi, \u201cEnd-to-End Speech-Translation with Knowledge Distillation: FBK@IWSLT2020,\u201d Proc. 17th International Conference on Spoken Language Translation, IWSLT 2020, Online, July 9 - 10, 2020, ed. M. Federico, A. Waibel, K. Knight, S. Nakamura, H. Ney, J. Niehues, S. St\u00fcker, D. Wu, J. Mariani, and F. Yvon, pp.80-88, Association for Computational Linguistics, 2020. 10.18653\/v1\/2020.iwslt-1.8","DOI":"10.18653\/v1\/2020.iwslt-1.8"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens, and Z. Wojna, \u201cRethinking the inception architecture for computer vision,\u201d 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016, pp.2818-2826, IEEE Computer Society, 2016. 10.1109\/cvpr.2016.308","DOI":"10.1109\/CVPR.2016.308"},{"key":"21","unstructured":"[21] R. M\u00fcller, S. Kornblith, and G.E. Hinton, \u201cWhen does label smoothing help?,\u201d Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, Dec. 8-14, 2019, Vancouver, BC, Canada, ed. H.M. Wallach, H. Larochelle, A. Beygelzimer, F. d&apos;Alch\u00e9-Buc, E.B. Fox, and R. Garnett, pp.4696-4705, 2019."},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] A. Graves, S. Fern\u00e1ndez, F. Gomez, and J. Schmidhuber, \u201cConnectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks,\u201d Machine Learning, Proc. Twenty-Third International Conference (ICML 2006), Pittsburgh, Pennsylvania, USA, June 25-29, 2006, ed. W.W. Cohen and A.W. Moore, ACM International Conference Proceeding Series, vol.148, pp.369-376, ACM, 2006. 10.1145\/1143844.1143891","DOI":"10.1145\/1143844.1143891"},{"key":"23","doi-asserted-by":"publisher","unstructured":"[23] S. Watanabe, T. Hori, S. Kim, J.R. Hershey, and T. Hayashi, \u201cHybrid ctc\/attention architecture for end-to-end speech recognition,\u201d IEEE J. Sel. Topics Signal Process., vol.11, no.8, pp.1240-1253, 2017. 10.1109\/jstsp.2017.2763455","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"24","unstructured":"[24] M. Post, G. Kumar, A. Lopez, D.G. Karakos, C. Callison-Burch, and S. Khudanpur, \u201cImproved speech-to-text translation with the fisher and callhome spanish-english speech translation corpus,\u201d Proc. 10th International Workshop on Spoken Language Translation: Papers, Heidelberg, Germany, Dec. 5-6, 2013, 2013."},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] P. Koehn, H. Hoang, A. Birch, C. Callison-Burch, M. Federico, N. Bertoldi, B. Cowan, W. Shen, C. Moran, R. Zens, C. Dyer, O. Bojar, A. Constantin, and E. Herbst, \u201cMoses: Open Source Toolkit for Statistical Machine Translation,\u201d ACL 2007, Proc. 45th Annual Meeting of the Association for Computational Linguistics, June 23-30, 2007, Prague, Czech Republic, ed. J.A. Carroll, A. van den Bosch, and A. Zaenen, The Association for Computational Linguistics, pp.177-180, 2007. 10.3115\/1557769.1557821","DOI":"10.3115\/1557769.1557821"},{"key":"26","unstructured":"[26] D. Povey, A. Ghoshal, G. Boulianne, L. Burget, O. Glembek, N. Goel, M. Hannemann, P. Motlicek, Y. Qian, P. Schwarz, et al., \u201cThe Kaldi speech recognition toolkit,\u201d IEEE 2011 workshop on automatic speech recognition and understanding, IEEE Signal Processing Society, 2011."},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] T. Ko, V. Peddinti, D. Povey, and S. Khudanpur, \u201cAudio augmentation for speech recognition,\u201d INTERSPEECH 2015, 16th Annual Conference of the International Speech Communication Association, Dresden, Germany, Sept. 6-10, 2015, pp.3586-3589, ISCA, 2015. 10.21437\/interspeech.2015-711","DOI":"10.21437\/Interspeech.2015-711"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] T. Kudo and J. Richardson, \u201cSentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing,\u201d Proc. 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp.66-71, 2018. 10.18653\/v1\/d18-2012","DOI":"10.18653\/v1\/D18-2012"},{"key":"29","unstructured":"[29] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, \u0141. Kaiser, and I. Polosukhin, \u201cAttention is All You Need,\u201d Advances in neural information processing systems, pp.5998-6008, 2017."},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] S. Watanabe, T. Hori, S. Karita, T. Hayashi, J. Nishitoba, Y. Unno, N.E.Y. Soplin, J. Heymann, M. Wiesner, N. Chen, A. Renduchintala, and T. Ochiai, \u201cESPnet: End-to-End Speech Processing Toolkit,\u201d Proc. Interspeech 2018, pp.2207-2211, 2018. 10.21437\/interspeech.2018-1456","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] K. Papineni, S. Roukos, T. Ward, and W.-J. Zhu, \u201cBLEU: a method for automatic evaluation of machine translation,\u201d Proc. 40th annual meeting of the Association for Computational Linguistics, pp.311-318, 2002. 10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135"},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] H. Inaguma, S. Kiyono, K. Duh, S. Karita, N. Yalta, T. Hayashi, and S. Watanabe, \u201cESPnet-ST: All-in-One Speech Translation Toolkit,\u201d Proc. 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations, ACL 2020, Online, July 5-10, 2020, ed. A. Celikyilmaz and T. Wen, pp.302-311, Association for Computational Linguistics, 2020. 10.18653\/v1\/2020.acl-demos.34","DOI":"10.18653\/v1\/2020.acl-demos.34"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] M. Post, \u201cA call for clarity in reporting BLEU scores,\u201d Proc. Third Conference on Machine Translation: Research Papers, Belgium, Brussels, pp.186-191, Association for Computational Linguistics, Oct. 2018. 10.18653\/v1\/w18-6319","DOI":"10.18653\/v1\/W18-6319"},{"key":"34","doi-asserted-by":"crossref","unstructured":"[34] D.S. Park, W. Chan, Y. Zhang, C.-C. Chiu, B. Zoph, E.D. Cubuk, and Q.V. Le, \u201cSpecaugment: A simple data augmentation method for automatic speech recognition,\u201d Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15-19 Sept. 2019, ed. G. Kubin and Z. Kacic, pp.2613-2617, ISCA, 2019. 10.21437\/interspeech.2019-2680","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] Y. Ko, K. Sudoh, S. Sakti, and S. Nakamura, \u201cASR Posterior-based Loss for Multi-task End-to-end Speech Translation,\u201d Proc. Interspeech 2021, pp.2272-2276, 2021. 10.21437\/interspeech.2021-1105","DOI":"10.21437\/Interspeech.2021-1105"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/10\/E107.D_2023EDP7249\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,5]],"date-time":"2024-10-05T03:45:28Z","timestamp":1728099928000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/10\/E107.D_2023EDP7249\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"references-count":35,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2023edp7249","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"type":"print","value":"0916-8532"},{"type":"electronic","value":"1745-1361"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"article-number":"2023EDP7249"}}