{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T22:53:16Z","timestamp":1743029596640,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":46,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789811692468"},{"type":"electronic","value":"9789811692475"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-981-16-9247-5_10","type":"book-chapter","created":{"date-parts":[[2022,1,11]],"date-time":"2022-01-11T21:25:33Z","timestamp":1641936333000},"page":"127-141","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ConWST: Non-native Multi-source Knowledge Distillation for Low Resource Speech Translation"],"prefix":"10.1007","author":[{"given":"Wenbo","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Hao","family":"Jin","sequence":"additional","affiliation":[]},{"given":"JianWen","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Lufeng","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Jinhai","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Qinghua","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Aiyuan","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,1,11]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Weiss, R.J., Chorowski, J., Jaitly, N., Wu, Y., Chen, Z.: Sequence-to-sequence models can directly translate foreign speech. arXiv preprint arXiv:1703.08581 (2017)","key":"10_CR1","DOI":"10.21437\/Interspeech.2017-503"},{"doi-asserted-by":"crossref","unstructured":"B\u2019erard, A., Besacier, L., Kocabiyikoglu, A.C., Pietquin, O.: End-to-end automatic speech translation of audiobooks. In: Proceedings of ICASSP (2018)","key":"10_CR2","DOI":"10.1109\/ICASSP.2018.8461690"},{"doi-asserted-by":"crossref","unstructured":"Tang, Y., Pino, J., Wang, C., Ma, X., Genzel, D.: A general multitask learning framework to leverage text data for speech to text tasks. In: Proceedings of ICASSP (2021)","key":"10_CR3","DOI":"10.1109\/ICASSP39728.2021.9415058"},{"doi-asserted-by":"crossref","unstructured":"Bansal, S., Kamper, H., Livescu, K., Lopez, A., Goldwater, S.: Pre-training on high-resource speech recognition improves low-resource speech-to-text translation. In: Proceedings of NAACL (2019)","key":"10_CR4","DOI":"10.18653\/v1\/N19-1006"},{"doi-asserted-by":"crossref","unstructured":"Stoian, M.C., Bansal, S., Goldwater, S.: Analyzing asr pre-training for low-resource speech-to-text translation. In: Proceedings of ICASSP (2020)","key":"10_CR5","DOI":"10.1109\/ICASSP40776.2020.9053847"},{"doi-asserted-by":"crossref","unstructured":"Wang, C., Wu, Y., Liu, S., Yang, Z., Zhou, M.: Bridging the gap between pre-training and finetuning for end-to-end speech translation. In: Proceedings of AAAI (2020)","key":"10_CR6","DOI":"10.18653\/v1\/2020.acl-main.344"},{"doi-asserted-by":"crossref","unstructured":"Jia, Y., et al.: Leveraging weakly supervised data to improve the end-to-end speech-to-text translation. In: Proceedings of ICASSP (2019)","key":"10_CR7","DOI":"10.1109\/ICASSP.2019.8683343"},{"doi-asserted-by":"crossref","unstructured":"Pino, J., Puzon, L., Gu, J., Ma, X., McCarthy, A.D., Gopinath, D.: Harnessing indirect training data for end-to-end automatic speech translation: tricks of the trade. In: Proceedings of IWSLT (2019)","key":"10_CR8","DOI":"10.21437\/Interspeech.2020-2938"},{"doi-asserted-by":"crossref","unstructured":"Salesky, E., Sperber, M., Black, A.W.: Exploring phoneme-level speech representations for end-to-end speech translation. In: Proceedings of ACL (2019)","key":"10_CR9","DOI":"10.18653\/v1\/P19-1179"},{"doi-asserted-by":"crossref","unstructured":"McCarthy, A.D., Puzon, L., Pino, J.: Skinaugment: auto-encoding speaker conversions for automatic speech translation. In: Proceedings of ICASSP (2020)","key":"10_CR10","DOI":"10.1109\/ICASSP40776.2020.9053406"},{"doi-asserted-by":"crossref","unstructured":"Wu, A., Wang, C., Pino, J., Gu, J.: Self-supervised representations improve end-to-end speech translation. In: Proceedings of Interspeech (2020)","key":"10_CR11","DOI":"10.21437\/Interspeech.2020-3094"},{"doi-asserted-by":"crossref","unstructured":"Nguyen, H., Bougares, F., Tomashenko, N., Est\u00e8ve, Y., Besaucier, L.: Investigating self-supervised pretraining for end-to-end speech translation. In: Proceedings of Interspeech (2020)","key":"10_CR12","DOI":"10.21437\/Interspeech.2020-1835"},{"doi-asserted-by":"crossref","unstructured":"Pino, J., Xu, Q., Ma, X., Dousti, M.J., Tang, Y.: Self-training for end-to-end speech translation. In: Proceedings of Interspeech (2020)","key":"10_CR13","DOI":"10.21437\/Interspeech.2020-2938"},{"doi-asserted-by":"crossref","unstructured":"Di Gangi, M.A., Negri, M., Turchi, M.: One-to-many multilingual end-to-end speech translation. In: Proceedings of ASRU (2019)","key":"10_CR14","DOI":"10.1109\/ASRU46091.2019.9004003"},{"doi-asserted-by":"crossref","unstructured":"Inaguma, H., Duh, K., Kawahara, T., Watanabe, S.: Multilingual end-to-end speech translation. In: Proceedings of ASRU (2019)","key":"10_CR15","DOI":"10.1109\/ASRU46091.2019.9003832"},{"doi-asserted-by":"crossref","unstructured":"Wang, C., Pino, J., Wu, A., Gu, J.: Covost: a diverse multilingual speech-to-text translation corpus. In: Proceedings of LREC (2020)","key":"10_CR16","DOI":"10.21437\/Interspeech.2021-2027"},{"doi-asserted-by":"crossref","unstructured":"Wang, C., Wu, A., Pino, J.: Covost 2 and massively multilingual speech-to-text translation arXiv (2020)","key":"10_CR17","DOI":"10.21437\/Interspeech.2021-2027"},{"unstructured":"Li, X., et al.: Multilingual speech translation with efficient finetuning of pretrained models. arXiv, abs\/2010.12829 (2021)","key":"10_CR18"},{"doi-asserted-by":"crossref","unstructured":"Anastasopoulos, A., Chiang, D.: Tied multitask learning for neural speech translation. In: Proceedings of NAACL (2018)","key":"10_CR19","DOI":"10.18653\/v1\/N18-1008"},{"doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: End-to-end speech translation with knowledge distillation. In: Proceedings of Interspeech (2019)","key":"10_CR20","DOI":"10.21437\/Interspeech.2019-2582"},{"doi-asserted-by":"crossref","unstructured":"Chuang, S.-P., Sung, T.-W., Liu, A.H., Lee, H.-Y.: Worse WER, but better BLEU? Leveraging word embedding as intermediate in multitask end-to-end speech translation. In: Proceedings of ACL (2020)","key":"10_CR21","DOI":"10.18653\/v1\/2020.acl-main.533"},{"doi-asserted-by":"crossref","unstructured":"Wang, C., Wu, Y., Liu, S., Zhou, M., Yang, Z.: Curriculum pre-training for end-to-end speech translation. In: Proceedings of ACL (2020)","key":"10_CR22","DOI":"10.18653\/v1\/2020.acl-main.344"},{"doi-asserted-by":"crossref","unstructured":"Salesky, E., Black, A.W.: Phone features improve speech translation. In: Proceedings of ACL (2020)","key":"10_CR23","DOI":"10.18653\/v1\/2020.acl-main.217"},{"doi-asserted-by":"crossref","unstructured":"Bansal, S., Kamper, H., Livescu, K., Lopez, A., Goldwater, S.: Pretraining on high-resource speech recognition improve the low-resource speech-to-text translation. In: Proceedings of NAACL (2019)","key":"10_CR24","DOI":"10.18653\/v1\/N19-1006"},{"doi-asserted-by":"crossref","unstructured":"Kim, Y., Rush, A.M.: Sequence-level knowledge distillation (2016)","key":"10_CR25","DOI":"10.18653\/v1\/D16-1139"},{"unstructured":"Zhou, C., Gu, J., Neubig, G.: Understanding knowledge distillation in non-autoregressive machine translation. In: Proceedings of ICLR (2019a)","key":"10_CR26"},{"doi-asserted-by":"crossref","unstructured":"Ren, Y., Liu, J., Tan, X., Zhao, Z., Zhao, S., Liu, T.Y.: A study of non-autoregressive model for sequence generation. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (2020)","key":"10_CR27","DOI":"10.18653\/v1\/2020.acl-main.15"},{"doi-asserted-by":"crossref","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: unsupervised pre-training for speech recognition. In: Proceedings of Interspeech, pp. 3465\u20133469 (2019)","key":"10_CR28","DOI":"10.21437\/Interspeech.2019-1873"},{"doi-asserted-by":"crossref","unstructured":"Inaguma, H., Kawahara, T., Watanabe, S.: .Source and target bidirectional knowledge distillation for end-to-end speech translation. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (2021)","key":"10_CR29","DOI":"10.18653\/v1\/2021.naacl-main.150"},{"unstructured":"Chen, T., Kornblith, S., Swersky, K., Norouzi, M., Hinton, G.E.: Big self-supervised models are strong semi-supervised learners. In: Advances in Neural Information Processing Systems, vol. 33 (2020)","key":"10_CR30"},{"unstructured":"Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)","key":"10_CR31"},{"unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. In: International Conference on Learning Representations (2019)","key":"10_CR32"},{"unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Proceedings of NeurIPS (2020)","key":"10_CR33"},{"unstructured":"Tran, C., Tang, Y., Li, X., Gu, J.: Cross-lingual retrieval for iterative self-supervised training. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Proceedings of NeurIPS (2020)","key":"10_CR34"},{"unstructured":"Baevski, A., Zhou, Y., Mohamed, A.-R., Auli, M.: wav2vec2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33 (2020)","key":"10_CR35"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)","key":"10_CR36"},{"unstructured":"Baevski, A., Zhou, H., Mohamed, A., Auli, M.: Wav2vec 2.0: a framework for self-supervised learning of speech representations (2020)","key":"10_CR37"},{"unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., et al.: Attention is all you need. arXiv (2017)","key":"10_CR38"},{"doi-asserted-by":"crossref","unstructured":"Koehn, P., et al.: Moses: open source toolkit for statistical machine translation. In: Proceedings of the 45th Annual Meeting of the ACL on Interactive Poster and Demonstration Sessions, pp. 177\u2013180. Association for Computational Linguistics (2007)","key":"10_CR39","DOI":"10.3115\/1557769.1557821"},{"unstructured":"Povey, D., et al.: The Kaldi speech recognition toolkit. In: IEEE 2011 workshop on automatic speech recognition and understanding. IEEE Signal Processing Society (2011)","key":"10_CR40"},{"doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Khudanpur, S.: Audio augmentation for speech recognition. In: Sixteenth Annual Conference of the International Speech Communication Association (2015)","key":"10_CR41","DOI":"10.21437\/Interspeech.2015-711"},{"key":"10_CR42","first-page":"2613","volume-title":"Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15\u201319 September 2019","author":"DS Park","year":"2019","unstructured":"Park, D.S.: Specaugment: a simple data augmentation method for automatic speech recognition. In: Kubin, G., Kacic, Z. (eds.) Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15\u201319 September 2019, pp. 2613\u20132617. ISCA (2019)"},{"unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Bengio, Y., LeCun, Y. (eds.) 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, 7\u20139 May 2015, Conference Track Proceedings (2015)","key":"10_CR43"},{"unstructured":"Aswani, A.V., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)","key":"10_CR44"},{"doi-asserted-by":"crossref","unstructured":"Rico Sennrich, A.B., Haddow, B.: Neural machine translation of rare words with subword units. In: Proceedings of ACL (2016)","key":"10_CR45","DOI":"10.18653\/v1\/P16-1162"},{"doi-asserted-by":"crossref","unstructured":"Provilkov, I., Emelianenko, D., Voita, E.: BPE-dropout: simple and effective subword regularization. In: Proceedings of ACL (2020)","key":"10_CR46","DOI":"10.18653\/v1\/2020.acl-main.170"}],"container-title":["Communications in Computer and Information Science","Cognitive Systems and Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-16-9247-5_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,7]],"date-time":"2022-05-07T02:13:53Z","timestamp":1651889633000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-16-9247-5_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9789811692468","9789811692475"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-981-16-9247-5_10","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"11 January 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICCSIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Cognitive Systems and Signal Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Suzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 November 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iccsip2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iccsip2021.tsingzhan.com\/#\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"105","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"41","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"39% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}