{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T08:48:24Z","timestamp":1742978904071,"version":"3.40.3"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031209796"},{"type":"electronic","value":"9783031209802"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20980-2_43","type":"book-chapter","created":{"date-parts":[[2022,11,12]],"date-time":"2022-11-12T19:03:09Z","timestamp":1668279789000},"page":"508-521","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Low-Resource Emotional Speech Synthesis: Transfer Learning and\u00a0Data Requirements"],"prefix":"10.1007","author":[{"given":"Anton","family":"Nesterenko","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruslan","family":"Akhmerov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yulia","family":"Matveeva","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anna","family":"Goremykina","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dmitry","family":"Astankov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Evgeniy","family":"Shuranov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexandra","family":"Shirshova","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,11,10]]},"reference":[{"key":"43_CR1","unstructured":"Adigwe, A., Tits, N., El Haddad, K., Ostadabbas, S., Dutoit, T.: The emotional voices database: towards controlling the emotion dimension in voice generation systems 06 (2018)"},{"key":"43_CR2","doi-asserted-by":"crossref","unstructured":"Cai, X., Dai, D., Wu, Z., Li, X., Li, J., Meng, H.: Emotion controllable speech synthesis using emotion-unlabeled dataset with the assistance of cross-domain speech emotion recognition. In: ICASSP 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020). arXiv:abs\/2011.08679","DOI":"10.1109\/ICASSP39728.2021.9413907"},{"key":"43_CR3","doi-asserted-by":"publisher","unstructured":"Cai, Z., Zhang, C., Li, M.: From speaker verification to multispeaker speech synthesis, deep transfer with feedback constraint 08, 1032 (2020). https:\/\/doi.org\/10.21437\/Interspeech","DOI":"10.21437\/Interspeech"},{"key":"43_CR4","unstructured":"Ganin, Y., Ustinova, E., Ajakan, H., Germain, P., Larochelle, H., Laviolette, F., March, M., Lempitsky, V.: Domain-adversarial training of neural networks. J. Mach. Learn. Res. 17(59), 1\u201335 (2016). https:\/\/jmlr.org\/papers\/v17\/15-239.html"},{"key":"43_CR5","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., Bengio, Y.: Generative adversarial nets. In: Ghahramani, Z., Welling, M., Cortes, C., Lawrence, N., Weinberger, K.Q. (eds.) Advances in Neural Information Processing Systems, vol. 27. Curran Associates, Inc. (2014). https:\/\/proceedings.neurips.cc\/paper\/2014\/file\/5ca3e9b122f61f8f06494c97b1afccf3-Paper.pdf"},{"key":"43_CR6","unstructured":"Ito, K., Johnson, L.: The LJ speech dataset (2017). https:\/\/keithito.com\/LJ-Speech-Dataset\/"},{"key":"43_CR7","doi-asserted-by":"publisher","unstructured":"Jang, W., Lim, D., Yoon, J.: Universal MelGAN: a robust neural vocoder for high-fidelity waveform generation in multiple domains (2020). https:\/\/doi.org\/10.48550\/ARXIV.2011.09631, arXiv:abs\/2011.09631","DOI":"10.48550\/ARXIV.2011.09631"},{"key":"43_CR8","unstructured":"Jemine, C., et al.: Real time voice cloning (2021). https:\/\/github.com\/CorentinJ\/Real-Time-Voice-Cloning"},{"key":"43_CR9","unstructured":"Jia, Y., et al.: Transfer learning from speaker verification to multispeaker text-to-speech synthesis. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems, pp. 4485\u20134495. NIPS\u201918, Curran Associates Inc., Red Hook, NY, USA (2018)"},{"key":"43_CR10","unstructured":"Kong, J., Casanova, E.: Hifi-gan (2013). https:\/\/github.com\/jik876\/hifi-gan"},{"key":"43_CR11","unstructured":"Kong, J., Kim, J., Bae, J.: Hifi-gan: generative adversarial networks for efficient and high fidelity speech synthesis (2020). arXiv:abs\/2010.05646"},{"key":"43_CR12","unstructured":"Kumar, K., Kumar, R., de Boissiere, T., Gestin, L., Teoh, W.Z., Sotelo, J., de Br\u00e9bisson, A., Bengio, Y., Courville, A.C.: MelGAN: generative adversarial networks for conditional waveform synthesis. In: Wallach, H., Larochelle, H., Beygelzimer, A., d\u2019Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems 32 (NeurIPS 2019), vol. 32. Curran Associates, Inc. (2019)"},{"key":"43_CR13","doi-asserted-by":"crossref","unstructured":"Li, T., Yang, S., Xue, L., Xie, L.: Controllable emotion transfer for end-to-end speech synthesis (2020). arXiv:abs\/2011.08679","DOI":"10.1109\/ISCSLP49672.2021.9362069"},{"key":"43_CR14","doi-asserted-by":"crossref","unstructured":"Liu, R., Sisman, B., Li, H.: Reinforcement learning for emotional text-to-speech synthesis with improved emotion discriminability (2021). arXiv:abs\/2104.01408","DOI":"10.21437\/Interspeech.2021-1236"},{"key":"43_CR15","doi-asserted-by":"publisher","unstructured":"Lu, C., Wen, X., Liu, R., Chen, X.: Multi-speaker emotional speech synthesis with fine-grained prosody modeling. In: ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5729\u20135733 (2021). https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9413398","DOI":"10.1109\/ICASSP39728.2021.9413398"},{"key":"43_CR16","doi-asserted-by":"crossref","unstructured":"McAuliffe, M., Socolof, M., Mihuc, S., Wagner, M., Sonderegger, M.: Montreal forced aligner: trainable text-speech alignment using kaldi. In: INTERSPEECH (2017)","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"43_CR17","unstructured":"Ren, Y., Hu, C., Tan, X., Qin, T., Zhao, S., Zhao, Z., Liu, T.Y.: Fastspeech 2: fast and high-quality end-to-end text to speech. In: International Conference on Learning Representations (2021)"},{"key":"43_CR18","unstructured":"Ren, Y., Ruan, Y., Tan, X., Qin, T., Zhao, S., Zhao, Z., Liu, T.Y.: Fastspeech: Fast, robust and controllable text to speech. In: Wallach, H., Larochelle, H., Beygelzimer, A., d\u00c1lch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 32. Curran Associates, Inc. (2019)"},{"key":"43_CR19","doi-asserted-by":"publisher","unstructured":"Shang, Z., Huang, Z., Zhang, H., Zhang, P., Yan, Y.: Incorporating cross-speaker style transfer for multi-language text-to-speech. In: Proceedings of the Interspeech 2021, pp. 1619\u20131623 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1265","DOI":"10.21437\/Interspeech.2021-1265"},{"key":"43_CR20","unstructured":"Shen, J., Jia, Y., Chrzanowski, M., Zhang, Y., Elias, I., Zen, H., Wu, Y.: Non-attentive tacotron: robust and controllable neural TTS synthesis including unsupervised duration modeling (2020). arXiv:abs\/2010.04301"},{"key":"43_CR21","doi-asserted-by":"crossref","unstructured":"Shen, J., Pang, R., Weiss, R.J., Schuster, M., Jaitly, N., Yang, Z., Chen, Z., Zhang, Y., Wang, Y., Skerry-Ryan, R.J., Saurous, R.A., Agiomyrgiannakis, Y., Wu, Y.: Natural TTS synthesis by conditioning wavenet on mel spectrogram predictions (2017). arXiv:abs\/1712.05884","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"43_CR22","doi-asserted-by":"crossref","unstructured":"Student: the probable error of a mean. Biometrika 6(1), 1\u201325 (1908). http:\/\/www.jstor.org\/stable\/2331554","DOI":"10.2307\/2331554"},{"key":"43_CR23","doi-asserted-by":"crossref","unstructured":"Tachibana, H., Uenoyama, K., Aihara, S.: Efficiently trainable text-to-speech system based on deep convolutional networks with guided attention (2017). arXiv:abs\/1710.08969","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"43_CR24","doi-asserted-by":"crossref","unstructured":"Tits, N., Haddad, K.E., Dutoit, T.: Exploring transfer learning for low resource emotional TTS (2019). arXiv:abs\/1901.04276","DOI":"10.1007\/978-3-030-29516-5_5"},{"key":"43_CR25","doi-asserted-by":"publisher","unstructured":"Um, S.Y., Oh, S., Byun, K., Jang, I., Ahn, C., Kang, H.G.: Emotional speech synthesis with rich and granularized control. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7254\u20137258 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053732","DOI":"10.1109\/ICASSP40776.2020.9053732"},{"key":"43_CR26","doi-asserted-by":"publisher","unstructured":"Wan, L., Wang, Q., Papir, A., Moreno, I.L.: Generalized end-to-end loss for speaker verification. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4879\u20134883 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8462665","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"43_CR27","doi-asserted-by":"publisher","unstructured":"Wang, J., Li, J., Zhao, X., Wu, Z., Kang, S., Meng, H.: Adversarially learning disentangled speech representations for robust multi-factor voice conversion. In: Proceedings of the Interspeech 2021, pp. 846\u2013850 (2021). https:\/\/doi.org\/10.21437\/Interspeech","DOI":"10.21437\/Interspeech"},{"key":"43_CR28","unstructured":"Wang, Y., Stanton, D., Zhang, Y., Skerry-Ryan, R.J., Battenberg, E., Shor, J., Xiao, Y., Ren, F., Jia, Y., Saurous, R.A.: Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis (2018). arXiv:abs\/1803.09017"},{"key":"43_CR29","unstructured":"Yamagishi, J., Veaux, C., MacDonald, K.: CSTR VCTK corpus: english multi-speaker corpus for CSTR voice cloning toolkit (version 0.92) (2019)"},{"key":"43_CR30","doi-asserted-by":"publisher","unstructured":"Yamamoto, R., Song, E., Kim, J.M.: Parallel wavegan: a fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. In: ICASSP 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6199\u20136203 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053795","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"43_CR31","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Weiss, R.J., Zen, H., Wu, Y., Chen, Z., Skerry-Ryan, R., Jia, Y., Rosenberg, A., Ramabhadran, B.: Learning to speak fluently in a foreign language: multilingual speech synthesis and cross-language voice cloning. In: Proceeding of the Interspeech 2019, pp. 2080\u20132084 (2019). https:\/\/doi.org\/10.21437\/Interspeech","DOI":"10.21437\/Interspeech"},{"key":"43_CR32","doi-asserted-by":"publisher","unstructured":"Zhou, K., Sisman, B., Li, H.: Limited data emotional voice conversion leveraging text-to-speech: two-stage sequence-to-sequence training. In: Proceeding of the Interspeech 2021, pp. 811\u2013815 (2021). https:\/\/doi.org\/10.21437\/Interspeech","DOI":"10.21437\/Interspeech"},{"key":"43_CR33","unstructured":"Zhou, K., Sisman, B., Liu, R., Li, H.: Emotional voice conversion: theory, databases and ESD (2021). arXiv:abs\/2105.14762"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20980-2_43","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,12]],"date-time":"2022-11-12T19:08:46Z","timestamp":1668280126000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20980-2_43"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031209796","9783031209802"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20980-2_43","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"10 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Gurugram","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 November 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 November 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.specom.co.in","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"99","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"60","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"61% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}