{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T23:06:26Z","timestamp":1743116786355,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819794362"},{"type":"electronic","value":"9789819794379"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-9437-9_16","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:27:22Z","timestamp":1730392042000},"page":"199-212","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Improving End-to-End Speech Translation with\u00a0Progressive Dual Encoding"],"prefix":"10.1007","author":[{"given":"Runlai","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Saihan","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yangfan","family":"Du","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tong","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingbo","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"16_CR1","unstructured":"Anmol, G., et al.: Conformer: convolution-augmented transformer for speech recognition (2020)"},{"key":"16_CR2","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"B\u00e9rard, A., Besacier, L., Kocabiyikoglu, A.C., Pietquin, O.: End-to-end automatic speech translation of audiobooks. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","DOI":"10.1109\/ICASSP.2018.8461690"},{"key":"16_CR4","unstructured":"Di\u00a0Gangi, M.A., Cattoni, R., Bentivogli, L., Negri, M., Turchi, M.: MuST-C: a multilingual speech translation corpus. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Minneapolis, Minnesota, pp. 2012\u20132017, June 2019"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Fang, Q., Ye, R., Li, L., Feng, Y., Wang, M.: STEMM: self-learning with speech-text manifold mixup for speech translation. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Dublin, Ireland, pp. 7050\u20137062, May 2022","DOI":"10.18653\/v1\/2022.acl-long.486"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 369\u2013376 (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"16_CR7","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: SentencePiece: a simple and language independent subword tokenizer and detokenizer for neural text processing. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, Brussels, Belgium, pp. 66\u201371, November 2018","DOI":"10.18653\/v1\/D18-2012"},{"key":"16_CR9","unstructured":"Lee, Y., Kim, T.: Learning pronunciation from a foreign language in speech synthesis networks. CoRR abs\/1811.09364 (2018)"},{"key":"16_CR10","unstructured":"Lison, P., Tiedemann, J.: OpenSubtitles2016: extracting large parallel corpora from movie and TV subtitles (2016)"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Meng, L., Xu, J., Tan, X., Wang, J., Qin, T., Xu, B.: MixSpeech: data augmentation for low-resource automatic speech recognition. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7008\u20137012. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414483"},{"key":"16_CR12","unstructured":"Michael, M., Michaela, S., Sarah, M., Michael, W., Morgan, S.: Trainable text-speech alignment using Kaldi. In: Interspeech, pp. 498\u2013502 (2017)"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Ouyang, S., Ye, R., Li, L.: WACO: word-aligned contrastive learning for speech translation. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Toronto, Canada, pp. 3891\u20133907, July 2023","DOI":"10.18653\/v1\/2023.acl-long.216"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: SpecAugment: a simple data augmentation method for automatic speech recognition. In: Interspeech 2019, September 2019","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Post, M.: A call for clarity in reporting BLEU scores. In: Proceedings of the Third Conference on Machine Translation: Research Papers, Brussels, Belgium, pp. 186\u2013191, October 2018","DOI":"10.18653\/v1\/W18-6319"},{"key":"16_CR16","first-page":"447","volume-title":"Fundamentals of Speech Recognition","author":"L Rabiner","year":"1993","unstructured":"Rabiner, L.: Fundamentals of Speech Recognition, pp. 447\u2013453. Prentice Hall, Englewood Cliffs (1993)"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Tang, Y., Pino, J., Li, X., Wang, C., Genzel, D.: Improving speech translation by understanding and learning from the auxiliary text translation task. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 4252\u20134261. Online, August 2021","DOI":"10.18653\/v1\/2021.acl-long.328"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Tang, Y., Pino, J., Wang, C., Ma, X., Genzel, D.: A general multi-task learning framework to leverage text data for speech to text tasks. In: ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6209\u20136213 (2021)","DOI":"10.1109\/ICASSP39728.2021.9415058"},{"key":"16_CR19","unstructured":"Vaswani, A., et al.: Attention is all you need. Ad. Neural Inf. Process. Syst. 30 (2017)"},{"key":"16_CR20","unstructured":"Verma, V., et al.: Manifold mixup: better representations by interpolating hidden states. In: International Conference on Machine Learning, pp. 6438\u20136447. PMLR (2019)"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Wang, C., Tang, Y., Ma, X., Wu, A., Okhonko, D., Pino, J.: Fairseq S2T: fast speech-to-text modeling with fairseq. In: Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing: System Demonstrations, Suzhou, China, pp. 33\u201339, December 2020","DOI":"10.18653\/v1\/2020.aacl-demo.6"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Weiss, R.J., Chorowski, J., Jaitly, N., Wu, Y., Chen, Z.: Sequence-to-sequence models can directly translate foreign speech. arXiv preprint arXiv:1703.08581 (2017)","DOI":"10.21437\/Interspeech.2017-503"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Xu, C., et al.: Stacked acoustic-and-textual encoding: Integrating the pre-trained models into speech translation encoders. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 2619\u20132630. Online, August 2021","DOI":"10.18653\/v1\/2021.acl-long.204"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Xu, C., et al.: Bridging the granularity gap for acoustic modeling. In: Findings of the Association for Computational Linguistics: ACL 2023, Toronto, Canada, pp. 10816\u201310833, July 2023","DOI":"10.18653\/v1\/2023.findings-acl.688"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Ye, R., Wang, M., Li, L.: End-to-end speech translation via cross-modal progressive training. In: Proceedings of INTERSPEECH, August 2021","DOI":"10.21437\/Interspeech.2021-1065"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Ye, R., Wang, M., Li, L.: Cross-modal contrastive learning for speech translation. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Seattle, United States, pp. 5099\u20135113, July 2022","DOI":"10.18653\/v1\/2022.naacl-main.376"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Yin, W., Liu, Z., Zhao, C., Wang, T., Tong, J., Ye, R.: Improving speech translation by fusing speech and text. In: Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, pp. 6262\u20136273, December 2023","DOI":"10.18653\/v1\/2023.findings-emnlp.414"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-9437-9_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T16:08:12Z","timestamp":1732982892000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-9437-9_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819794362","9789819794379"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-9437-9_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hangzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2024\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}