{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:23:33Z","timestamp":1766067813825,"version":"3.40.3"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031404979"},{"type":"electronic","value":"9783031404986"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-40498-6_20","type":"book-chapter","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T23:02:34Z","timestamp":1692745354000},"page":"226-238","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["When Whisper Meets TTS: Domain Adaptation Using only\u00a0Synthetic Speech Data"],"prefix":"10.1007","author":[{"given":"Juan Camilo","family":"V\u00e1squez-Correa","sequence":"first","affiliation":[]},{"given":"Haritz","family":"Arzelus","sequence":"additional","affiliation":[]},{"given":"Juan M.","family":"Martin-Do\u00f1as","sequence":"additional","affiliation":[]},{"given":"Joaquin","family":"Arellano","sequence":"additional","affiliation":[]},{"given":"Ander","family":"Gonzalez-Docasal","sequence":"additional","affiliation":[]},{"given":"Aitor","family":"\u00c1lvarez","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,23]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Li, J., et al.: Recent advances in end-to-end automatic speech recognition. APSIPA Trans. Sign. Inf. Proc. 11(1) (2022)","DOI":"10.1561\/116.00000050"},{"key":"20_CR2","unstructured":"Baevski, A., et al.: Wav2Vec 2.0: a framework for self-supervised learning of speech representations. In: NEURIPS, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition. In: Proceedings of the INTERSPEECH, pp. 5036\u20135040 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"20_CR4","unstructured":"Radford, A., et al.: Robust speech recognition via large-scale weak supervision. Technical report, OpenAI (2022)"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: SpecAugment: a simple data augmentation method for automatic speech recognition. In: Proceedings of the INTERSPEECH, pp. 2613\u20132617 (2019)","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"20_CR6","unstructured":"Li, J., et al.: Training neural speech recognition systems with synthetic speech augmentation. arXiv preprint arXiv:1811.00707 (2018)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Rosenberg, A., et al.: Speech recognition with augmented synthesized speech. In: Proceedings of the ASRU, pp. 996\u20131002. IEEE (2019)","DOI":"10.1109\/ASRU46091.2019.9003990"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Laptev, A., et al.: You do not need more data: improving end-to-end speech recognition by text-to-speech data augmentation. In: Proceedings of the CISP-BMEI, pp. 439\u2013444. IEEE (2020)","DOI":"10.1109\/CISP-BMEI51763.2020.9263564"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Rossenbach, N., et al.: Generating synthetic audio data for attention-based speech recognition systems. In: Proceedings of the ICASSP, pp. 7069\u20137073. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053008"},{"key":"20_CR10","unstructured":"Wang, Y., et al.: Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis. In Proceedings of the ICML, pp. 5180\u20135189. PMLR (2018)"},{"key":"20_CR11","unstructured":"Wang, C., et al.: Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)"},{"key":"20_CR12","doi-asserted-by":"crossref","unstructured":"Ueno, S., et al.: Multi-speaker sequence-to-sequence speech synthesis for data augmentation in acoustic-to-word speech recognition. In Proceedings of the ICASSP, pp. 6161\u20136165. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682816"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Zheng, X., Liu, Y., Gunceler, D., Willett, D.: Using synthetic audio to improve the recognition of out-of-vocabulary words in end-to-end ASR systems. In: Proceedings of the ICASSP, pp. 5674\u20135678. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414778"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Fazel, A., et al.: SynthASR: unlocking synthetic data for speech recognition. arXiv preprint arXiv:2106.07803 (2021)","DOI":"10.21437\/Interspeech.2021-1882"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Ueno, S., et al.: Data augmentation for ASR using TTS via a discrete representation. In: Proceedings of the ASRU, pp. 68\u201375. IEEE (2021)","DOI":"10.1109\/ASRU51503.2021.9688218"},{"key":"20_CR16","doi-asserted-by":"publisher","first-page":"494","DOI":"10.1016\/j.neunet.2023.01.027","volume":"161","author":"L Qu","year":"2023","unstructured":"Qu, L., Weber, C., Wermter, S.: Emphasizing unseen words: new vocabulary acquisition for end-to-end speech recognition. Neural Netw. 161, 494\u2013504 (2023)","journal-title":"Neural Netw."},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Hu, T.Y., et al.: Synt++: utilizing imperfect synthetic data to improve speech recognition. In: Proceedings of the ICASSP, pp. 7682\u20137686. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746217"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Mimura, M., et al.: Leveraging sequence-to-sequence speech synthesis for enhancing acoustic-to-word speech recognition. In: Proceedings of the SLT, pp. 477\u2013484. IEEE (2018)","DOI":"10.1109\/SLT.2018.8639589"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Panayotov, V., et al.: LibriSpeech: an ASR corpus based on public domain audio books. In: Proceedings of the ICASSP, pp. 5206\u20135210 (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"20_CR20","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1038\/s42256-023-00613-9","volume":"5","author":"N Ding","year":"2023","unstructured":"Ding, N., et al.: Parameter-efficient fine-tuning of large-scale pre-trained language models. Nature Mach. Intell. 5, 1\u201316 (2023)","journal-title":"Nature Mach. Intell."},{"key":"20_CR21","unstructured":"Hu, E.J., Shen, Y., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"20_CR22","unstructured":"Zhang, Q., et al.: Adaptive budget allocation for parameter-efficient fine-tuning. arXiv preprint arXiv:2303.10512 (2023)"},{"key":"20_CR23","unstructured":"Zaken, E.B., et al.: BitFit: simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199 (2021)"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Shen, et al.: Natural TTS synthesis by conditioning WaveNet on MEL spectrogram predictions. In: Proceedings of the ICASSP, pp. 4779\u20134783. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"20_CR25","unstructured":"Kong, J., et al.: Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Proceedings of the NEURIPS, vol. 33, pp. 17022\u201317033 (2020)"},{"key":"20_CR26","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. 2015 ICLR. arXiv preprint arXiv:1412.6980 (2015)"},{"key":"20_CR27","unstructured":"Ito, K., Johnson, L.: The LJ speech dataset (2017). www.http:\/\/keithito.com\/LJ-Speech-Dataset\/"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"V\u00e1squez-Correa, J.C., \u00c1lvarez Muniain, A.: Novel speech recognition systems applied to forensics within child exploitation: Wav2Vec 2. 0 vs. whisper. Sensors 23(4), 1843 (2023)","DOI":"10.3390\/s23041843"},{"issue":"2","key":"20_CR29","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s10579-017-9410-y","volume":"53","author":"T Baumann","year":"2019","unstructured":"Baumann, T., et al.: The spoken Wikipedia corpus collection: harvesting, alignment and an application to hyperlistening. Lang. Resour. Eval. 53(2), 303\u2013329 (2019)","journal-title":"Lang. Resour. Eval."},{"key":"20_CR30","unstructured":"Mirkin, S., et al.: A recorded debating dataset. In: Proceedings of the LREC, pp. 250\u2013254 (2017)"},{"key":"20_CR31","unstructured":"Rousseau, A., et al.: Enhancing the TED-LIUM corpus with selected data for language modeling and more ted talks. In: Proceedings of the LREC, pp. 3935\u20133939 (2014)"},{"key":"20_CR32","unstructured":"Lleida, E., et al.: Albayzin evaluation: IberSPEECH-RTVE 2022 speech to text transcription challenge (2022)"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Dinkel, H., et al.: Voice activity detection in the wild: a data-driven approach using teacher-student training. IEEE\/ACM Trans. Audio, Speech Lang. Process. 29, 1542\u20131555 (2021)","DOI":"10.1109\/TASLP.2021.3073596"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Gemmeke, J., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: Proceedings of the ICASSP, pp. 776\u2013780 (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Arzelus, H., et al.: The Vicomtech-UPM speech transcription systems for the albayz\u0131n-rtve 2022 speech to text transcription challenge. In: Proceedings of the IberSPEECH, pp. 266\u2013270 (2022)","DOI":"10.21437\/IberSPEECH.2022-54"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"T. Etchegoyhen et al. mintzai-st: Corpus and baselines for basque-spanish speech translation. In: Proceedings of the IberSPEECH, pp. 1\u20135 (2021)","DOI":"10.21437\/IberSPEECH.2021-41"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: P-tuning v2: prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"20_CR38","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. In: Proceedings of the ACL, pp. 4582\u20134597 (2021)"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-40498-6_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T23:04:50Z","timestamp":1692745490000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-40498-6_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031404979","9783031404986"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-40498-6_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"23 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pilsen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2023\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMS & back-office system","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"64","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"31","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"48% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.56","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}