{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:13:33Z","timestamp":1750220013201,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,3,3]],"date-time":"2023-03-03T00:00:00Z","timestamp":1677801600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2020AAA0106600"],"award-info":[{"award-number":["2020AAA0106600"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,3,3]]},"DOI":"10.1145\/3594409.3594415","type":"proceedings-article","created":{"date-parts":[[2023,7,26]],"date-time":"2023-07-26T17:31:06Z","timestamp":1690392666000},"page":"178-182","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Forward Translation to Mix Data for Speech Translation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6140-4218","authenticated-orcid":false,"given":"Zhipeng","family":"Wang","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Beijing Institute of Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9937-2756","authenticated-orcid":false,"given":"Hongjing","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Beijing Institute of Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6947-0053","authenticated-orcid":false,"given":"Shuoying","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Beijing Institute of Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6343-285X","authenticated-orcid":false,"given":"Yuhang","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Beijing Institute of Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,7,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the Workshop on Collaborative Translation: technology, crowdsourcing, and the translator perspective","author":"Munro R.","year":"2010","unstructured":"[ 1 ] Munro , R. 2010 . Crowdsourced translation for emergency response in Haiti: the global collaboration of local knowledge . Proceedings of the Workshop on Collaborative Translation: technology, crowdsourcing, and the translator perspective ( Denver, Colorado, USA , Oct. 2010). [1] Munro, R. 2010. Crowdsourced translation for emergency response in Haiti: the global collaboration of local knowledge. Proceedings of the Workshop on Collaborative Translation: technology, crowdsourcing, and the translator perspective (Denver, Colorado, USA, Oct. 2010)."},{"key":"e_1_3_2_1_2_1","volume-title":"On Using SpecAugment for End-to-End Speech Translation. arXiv:1911.08876 [cs, eess]. (Nov","author":"Bahar P.","year":"2019","unstructured":"[ 2 ] Bahar , P. 2019. On Using SpecAugment for End-to-End Speech Translation. arXiv:1911.08876 [cs, eess]. (Nov . 2019 ). [2] Bahar, P. 2019. On Using SpecAugment for End-to-End Speech Translation. arXiv:1911.08876 [cs, eess]. (Nov. 2019)."},{"key":"e_1_3_2_1_3_1","volume-title":"Listen and Translate: A Proof of Concept for End-to-End Speech-to-Text Translation. arXiv:1612.01744 [cs]. (Dec","author":"Berard A.","year":"2016","unstructured":"[ 3 ] Berard , A. 2016. Listen and Translate: A Proof of Concept for End-to-End Speech-to-Text Translation. arXiv:1612.01744 [cs]. (Dec . 2016 ). [3] Berard, A. 2016. Listen and Translate: A Proof of Concept for End-to-End Speech-to-Text Translation. arXiv:1612.01744 [cs]. (Dec. 2016)."},{"key":"e_1_3_2_1_4_1","volume-title":"Sequence-to-Sequence Models Can Directly Translate Foreign Speech. arXiv:1703.08581 [cs, stat]. (Jun","author":"Weiss R.J.","year":"2017","unstructured":"[ 4 ] Weiss , R.J. 2017. Sequence-to-Sequence Models Can Directly Translate Foreign Speech. arXiv:1703.08581 [cs, stat]. (Jun . 2017 ). [4] Weiss, R.J. 2017. Sequence-to-Sequence Models Can Directly Translate Foreign Speech. arXiv:1703.08581 [cs, stat]. (Jun. 2017)."},{"key":"e_1_3_2_1_5_1","volume-title":"End-to-End Speech Translation with Knowledge Distillation. Interspeech 2019 (Sep","author":"Liu Y.","year":"2019","unstructured":"[ 5 ] Liu , Y. 2019. End-to-End Speech Translation with Knowledge Distillation. Interspeech 2019 (Sep . 2019 ), 1128\u20131132. [5] Liu, Y. 2019. End-to-End Speech Translation with Knowledge Distillation. Interspeech 2019 (Sep. 2019), 1128\u20131132."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.325"},{"key":"e_1_3_2_1_7_1","volume-title":"Dual-decoder Transformer for Joint Automatic Speech Recognition and Multilingual Speech Translation. arXiv:2011.00747 [cs, eess]. (Nov","author":"Le H.","year":"2020","unstructured":"[ 7 ] Le , H. 202 0. Dual-decoder Transformer for Joint Automatic Speech Recognition and Multilingual Speech Translation. arXiv:2011.00747 [cs, eess]. (Nov . 2020 ). [7] Le, H. 2020. Dual-decoder Transformer for Joint Automatic Speech Recognition and Multilingual Speech Translation. arXiv:2011.00747 [cs, eess]. (Nov. 2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.103"},{"key":"e_1_3_2_1_9_1","volume-title":"Multilingual Speech Translation with Efficient Finetuning of Pretrained Models. arXiv:2010.12829 [cs]. (Jan","author":"Li X.","year":"2021","unstructured":"[ 9 ] Li , X. 2021. Multilingual Speech Translation with Efficient Finetuning of Pretrained Models. arXiv:2010.12829 [cs]. (Jan . 2021 ). [9] Li, X. 2021. Multilingual Speech Translation with Efficient Finetuning of Pretrained Models. arXiv:2010.12829 [cs]. (Jan. 2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arXiv:2006.11477 [cs, eess]. (Oct","author":"Baevski A.","year":"2020","unstructured":"[ 10 ] Baevski , A. 202 0. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arXiv:2006.11477 [cs, eess]. (Oct . 2020 ). [10] Baevski, A. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arXiv:2006.11477 [cs, eess]. (Oct. 2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. arXiv:2106.07447 [cs, eess]. (Jun","author":"Hsu W.","year":"2021","unstructured":"[ 11 ] Hsu , W. - N. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. arXiv:2106.07447 [cs, eess]. (Jun . 2021 ). [11] Hsu, W.-N. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. arXiv:2106.07447 [cs, eess]. (Jun. 2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"Bridging the Modality Gap for Speech-to-Text Translation. arXiv:2010.14920 [cs]. (Oct","author":"Liu Y.","year":"2020","unstructured":"[ 12 ] Liu , Y. 202 0. Bridging the Modality Gap for Speech-to-Text Translation. arXiv:2010.14920 [cs]. (Oct . 2020 ). [12] Liu, Y. 2020. Bridging the Modality Gap for Speech-to-Text Translation. arXiv:2010.14920 [cs]. (Oct. 2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Fused Acoustic and Text Encoding for Multimodal Bilingual Pretraining and Speech Translation. arXiv:2102.05766 [cs]. (Sep","author":"Zheng R.","year":"2021","unstructured":"[ 13 ] Zheng , R. 2021. Fused Acoustic and Text Encoding for Multimodal Bilingual Pretraining and Speech Translation. arXiv:2102.05766 [cs]. (Sep . 2021 ). [13] Zheng, R. 2021. Fused Acoustic and Text Encoding for Multimodal Bilingual Pretraining and Speech Translation. arXiv:2102.05766 [cs]. (Sep. 2021)."},{"key":"e_1_3_2_1_14_1","volume-title":"Learning Shared Semantic Space for Speech-to-Text Translation. arXiv:2105.03095 [cs]. (Aug","author":"Han C.","year":"2021","unstructured":"[ 14 ] Han , C. 2021. Learning Shared Semantic Space for Speech-to-Text Translation. arXiv:2105.03095 [cs]. (Aug . 2021 ). [14] Han, C. 2021. Learning Shared Semantic Space for Speech-to-Text Translation. arXiv:2105.03095 [cs]. (Aug. 2021)."},{"key":"e_1_3_2_1_15_1","series-title":"Apr. 2021","volume-title":"Listen, Understand and Translate\u201d: Triple Supervision Decouples End-to-end Speech-to-text Translation. arXiv:2009.09704 [cs, eess].","author":"Dong Q.","unstructured":"[ 15 ] Dong , Q. 2021. \u201c Listen, Understand and Translate\u201d: Triple Supervision Decouples End-to-end Speech-to-text Translation. arXiv:2009.09704 [cs, eess]. ( Apr. 2021 ) . [15] Dong, Q. 2021. \u201cListen, Understand and Translate\u201d: Triple Supervision Decouples End-to-end Speech-to-text Translation. arXiv:2009.09704 [cs, eess]. (Apr. 2021)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.204"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5281\/ZENODO.3525492"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Di Gangi M.A.","year":"2019","unstructured":"[ 18 ] Di Gangi , M.A. 2019 . MuST-C: a Multilingual Speech Translation Corpus . Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , Volume 1 (Long and Short Papers) (Minneapolis, Minnesota , Jun. 2019), 2012\u20132017. [18] Di Gangi, M.A. 2019. MuST-C: a Multilingual Speech Translation Corpus. Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (Minneapolis, Minnesota, Jun. 2019), 2012\u20132017."},{"key":"e_1_3_2_1_19_1","volume-title":"Microsoft Speech Language Translation (MSLT) Corpus: The IWSLT 2016 release for English, French and German. International workshop on spoken language translation.","author":"Federmann C.","year":"2016","unstructured":"[ 19 ] Federmann , C. and Lewis, W.D . Microsoft Speech Language Translation (MSLT) Corpus: The IWSLT 2016 release for English, French and German. International workshop on spoken language translation. 2016 . 7. [19] Federmann, C. and Lewis, W.D. Microsoft Speech Language Translation (MSLT) Corpus: The IWSLT 2016 release for English, French and German. International workshop on spoken language translation. 2016. 7."},{"key":"e_1_3_2_1_20_1","volume-title":"MaSS: A Large and Clean Multilingual Corpus of Sentence-aligned Spoken Utterances Extracted from the Bible. arXiv:1907.12895 [cs]. (Feb","author":"Boito M.Z.","year":"2020","unstructured":"[ 20 ] Boito , M.Z. 202 0. MaSS: A Large and Clean Multilingual Corpus of Sentence-aligned Spoken Utterances Extracted from the Bible. arXiv:1907.12895 [cs]. (Feb . 2020 ). [20] Boito, M.Z. 2020. MaSS: A Large and Clean Multilingual Corpus of Sentence-aligned Spoken Utterances Extracted from the Bible. arXiv:1907.12895 [cs]. (Feb. 2020)."},{"key":"e_1_3_2_1_21_1","volume-title":"Europarl-ST: A Multilingual Corpus For Speech Translation Of Parliamentary Debates. arXiv:1911.03167 [cs, eess]. (Feb","author":"Iranzo-S\u00e1nchez J.","year":"2020","unstructured":"[ 21 ] Iranzo-S\u00e1nchez , J. 202 0. Europarl-ST: A Multilingual Corpus For Speech Translation Of Parliamentary Debates. arXiv:1911.03167 [cs, eess]. (Feb . 2020 ). [21] Iranzo-S\u00e1nchez, J. 2020. Europarl-ST: A Multilingual Corpus For Speech Translation Of Parliamentary Debates. arXiv:1911.03167 [cs, eess]. (Feb. 2020)."},{"key":"e_1_3_2_1_22_1","volume-title":"CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus. arXiv:2002.01320 [cs]. (Jun","author":"Wang C.","year":"2020","unstructured":"[ 22 ] Wang , C. 202 0. CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus. arXiv:2002.01320 [cs]. (Jun . 2020 ). [22] Wang, C. 2020. CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus. arXiv:2002.01320 [cs]. (Jun. 2020)."},{"key":"e_1_3_2_1_23_1","volume-title":"CoVoST 2 and Massively Multilingual Speech-to-Text Translation. arXiv:2007.10310 [cs, eess]. (Oct","author":"Wang C.","year":"2020","unstructured":"[ 23 ] Wang , C. 202 0. CoVoST 2 and Massively Multilingual Speech-to-Text Translation. arXiv:2007.10310 [cs, eess]. (Oct . 2020 ). [23] Wang, C. 2020. CoVoST 2 and Massively Multilingual Speech-to-Text Translation. arXiv:2007.10310 [cs, eess]. (Oct. 2020)."},{"key":"e_1_3_2_1_24_1","volume-title":"CVSS Corpus and Massively Multilingual Speech-to-Speech Translation. arXiv:2201.03713 [cs, eess]. (Jan","author":"Jia Y.","year":"2022","unstructured":"[ 24 ] Jia , Y. 2022. CVSS Corpus and Massively Multilingual Speech-to-Speech Translation. arXiv:2201.03713 [cs, eess]. (Jan . 2022 ). [24] Jia, Y. 2022. CVSS Corpus and Massively Multilingual Speech-to-Speech Translation. arXiv:2201.03713 [cs, eess]. (Jan. 2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"[\n  25\n  ]  Ramponi A. and Plank B. 2020. Neural Unsupervised Domain Adaptation in NLP\u2014A Survey. arXiv:2006.00632 [cs]. (Oct. 2020).  [25] Ramponi A. and Plank B. 2020. Neural Unsupervised Domain Adaptation in NLP\u2014A Survey. arXiv:2006.00632 [cs]. (Oct. 2020).","DOI":"10.18653\/v1\/2020.coling-main.603"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/3305890.3305990"},{"key":"e_1_3_2_1_27_1","volume-title":"Self-Training for End-to-End Speech Translation. arXiv:2006.02490 [cs, eess]. (Oct","author":"Pino J.","year":"2020","unstructured":"[ 27 ] Pino , J. 202 0. Self-Training for End-to-End Speech Translation. arXiv:2006.02490 [cs, eess]. (Oct . 2020 ). [27] Pino, J. 2020. Self-Training for End-to-End Speech Translation. arXiv:2006.02490 [cs, eess]. (Oct. 2020)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639610"},{"key":"e_1_3_2_1_29_1","volume-title":"SpeechStew: Simply Mix All Available Speech Recognition Data to Train One Large Neural Network. arXiv:2104.02133 [cs]. (Apr","author":"Chan W.","year":"2021","unstructured":"[ 29 ] Chan , W. 2021. SpeechStew: Simply Mix All Available Speech Recognition Data to Train One Large Neural Network. arXiv:2104.02133 [cs]. (Apr . 2021 ). [29] Chan, W. 2021. SpeechStew: Simply Mix All Available Speech Recognition Data to Train One Large Neural Network. arXiv:2104.02133 [cs]. (Apr. 2021)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"e_1_3_2_1_31_1","volume-title":"SkinAugment: Auto-Encoding Speaker Conversions for Automatic Speech Translation. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (May","author":"McCarthy A.D.","year":"2020","unstructured":"[ 31 ] McCarthy , A.D. 2020 . SkinAugment: Auto-Encoding Speaker Conversions for Automatic Speech Translation. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (May 2020), 7924\u20137928. [31] McCarthy, A.D. 2020. SkinAugment: Auto-Encoding Speaker Conversions for Automatic Speech Translation. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (May 2020), 7924\u20137928."},{"key":"e_1_3_2_1_32_1","volume-title":"Sample","author":"Lam T.K.","year":"2022","unstructured":"[ 32 ] Lam , T.K. 2022. Sample , Translate, Recombine : Leveraging Audio Alignments for Data Augmentation in End-to-end Speech Translation . arXiv:2203.08757 [cs, eess]. ( Mar. 2022 ). [32] Lam, T.K. 2022. Sample, Translate, Recombine: Leveraging Audio Alignments for Data Augmentation in End-to-end Speech Translation. arXiv:2203.08757 [cs, eess]. (Mar. 2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1002\/nur.20364"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IALP51396.2020.9310459"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the Fifth Conference on Machine Translation (Online","author":"Wang L.","year":"2020","unstructured":"[ 35 ] Wang , L. 2020 . Tencent AI Lab Machine Translation Systems for WMT20 Chat Translation Task . Proceedings of the Fifth Conference on Machine Translation (Online , Nov. 2020), 483\u2013491. [35] Wang, L. 2020. Tencent AI Lab Machine Translation Systems for WMT20 Chat Translation Task. Proceedings of the Fifth Conference on Machine Translation (Online, Nov. 2020), 483\u2013491."},{"key":"e_1_3_2_1_36_1","volume-title":"fairseq S2T: Fast Speech-to-Text Modeling with fairseq. arXiv:2010.05171 [cs, eess]. (Oct","author":"Wang C.","year":"2020","unstructured":"[ 36 ] Wang , C. 202 0. fairseq S2T: Fast Speech-to-Text Modeling with fairseq. arXiv:2010.05171 [cs, eess]. (Oct . 2020 ). [36] Wang, C. 2020. fairseq S2T: Fast Speech-to-Text Modeling with fairseq. arXiv:2010.05171 [cs, eess]. (Oct. 2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"arXiv:1706.03762 [cs]. (Dec","author":"Vaswani A.","year":"2017","unstructured":"[ 37 ] Vaswani , A. 2017. Attention Is All You Need. arXiv:1706.03762 [cs]. (Dec . 2017 ). [37] Vaswani, A. 2017. Attention Is All You Need. arXiv:1706.03762 [cs]. (Dec. 2017)."}],"event":{"name":"ICIAI 2023: 2023 the 7th International Conference on Innovation in Artificial Intelligence","acronym":"ICIAI 2023","location":"Harbin China"},"container-title":["Proceedings of the 2023 7th International Conference on Innovation in Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3594409.3594415","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3594409.3594415","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:51:16Z","timestamp":1750182676000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3594409.3594415"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,3]]},"references-count":37,"alternative-id":["10.1145\/3594409.3594415","10.1145\/3594409"],"URL":"https:\/\/doi.org\/10.1145\/3594409.3594415","relation":{},"subject":[],"published":{"date-parts":[[2023,3,3]]},"assertion":[{"value":"2023-07-26","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}