{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T03:16:02Z","timestamp":1774926962678,"version":"3.50.1"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,6,6]]},"DOI":"10.1109\/icassp39728.2021.9413466","type":"proceedings-article","created":{"date-parts":[[2021,5,13]],"date-time":"2021-05-13T19:53:45Z","timestamp":1620935625000},"page":"6593-6597","source":"Crossref","is-referenced-by-count":31,"title":["Low-Resource Expressive Text-To-Speech Using Data Augmentation"],"prefix":"10.1109","author":[{"given":"Goeric","family":"Huybrechts","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Merritt","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Giulia","family":"Comini","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bartek","family":"Perz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Raahil","family":"Shah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jaime","family":"Lorenzo-Trueba","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","first-page":"52","article-title":"Exploring transfer learning for low resource emotional tts","author":"tits","year":"2019","journal-title":"Proceedings of SAI Intelligent Systems Conference"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682168"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2730"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1403"},{"key":"ref14","article-title":"The effectiveness of data augmentation in image classification using deep learning","author":"perez","year":"2017","journal-title":"arXiv preprint arXiv 1712 04621"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/IIPHDW.2018.8388338"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-019-0197-0"},{"key":"ref17","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Sixteenth Annual Conference of the International Speech Communication Association"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref28","article-title":"Dynamic prosody generation for speech synthesis using linguistics-driven acoustic embedding selection","author":"tyagi","year":"2019","journal-title":"arXiv preprint arXiv 1912 00955"},{"key":"ref4","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","author":"skerry-ryan","year":"2018","journal-title":"arXiv preprint arXiv 1803 09047"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref3","author":"sotelo","year":"2017","journal-title":"Char2Wav End-to-end speech synthesis"},{"key":"ref6","first-page":"3918","article-title":"Parallel wavenet: Fast high-fidelity speech synthesis","author":"oord","year":"2018","journal-title":"International Conference on Machine Learning"},{"key":"ref29","article-title":"1534-1, method for the subjective assessment of intermediate quality levels of coding systems (mushra)","author":"itu-r","year":"2003","journal-title":"International Telecommunication Union"},{"key":"ref5","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"2018","journal-title":"arXiv preprint arXiv 1802 08908"},{"key":"ref8","first-page":"2962","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","author":"gibiansky","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683862"},{"key":"ref2","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"arXiv preprint arXiv 1703 10593"},{"key":"ref9","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref1","article-title":"Wavenet: A generative model for raw audio","author":"oord","year":"2016","journal-title":"arXiv preprint arXiv 1609 04802"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403331"},{"key":"ref22","article-title":"Voice conversion from unaligned corpora using variational autoencoding wasserstein generative adversarial networks","author":"hsu","year":"2017","journal-title":"arXiv preprint arXiv 1704 00849"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"ref23","article-title":"The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods","author":"lorenzo-trueba","year":"2018","journal-title":"arXiv preprint arXiv 1804 04121"},{"key":"ref26","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2013","journal-title":"arXiv preprint arXiv 1312 6114"},{"key":"ref25","article-title":"Copycat: Many-to-many fine-grained prosody transfer for neural text-to-speech","author":"karlapati","year":"2020","journal-title":"arXiv preprint arXiv 2004 14408"}],"event":{"name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Toronto, ON, Canada","start":{"date-parts":[[2021,6,6]]},"end":{"date-parts":[[2021,6,11]]}},"container-title":["ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9413349\/9413350\/09413466.pdf?arnumber=9413466","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T15:41:13Z","timestamp":1652197273000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9413466\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,6]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/icassp39728.2021.9413466","relation":{},"subject":[],"published":{"date-parts":[[2021,6,6]]}}}