{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T04:47:57Z","timestamp":1742964477822,"version":"3.40.3"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030878016"},{"type":"electronic","value":"9783030878023"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-87802-3_11","type":"book-chapter","created":{"date-parts":[[2021,9,21]],"date-time":"2021-09-21T23:36:52Z","timestamp":1632267412000},"page":"112-123","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Improved Prosodic Clustering for Multispeaker and Speaker-Independent Phoneme-Level Prosody Control"],"prefix":"10.1007","author":[{"given":"Myrsini","family":"Christidou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexandra","family":"Vioni","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nikolaos","family":"Ellinas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Georgios","family":"Vamvoukakis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Konstantinos","family":"Markopoulos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Panos","family":"Kakoulidis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"June Sig","family":"Sung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hyoungmin","family":"Park","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aimilios","family":"Chalamandaris","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pirros","family":"Tsiakoulis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,9,22]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Angelini, O., Moinet, A., Yanagisawa, K., Drugman, T.: Singing synthesis: with a little help from my attention. In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-1399"},{"key":"11_CR2","unstructured":"Battenberg, E., et al.: Effective use of variational embedding capacity in expressive end-to-end speech synthesis. arXiv:1906.03402 (2019)"},{"issue":"12","key":"11_CR3","doi-asserted-by":"publisher","first-page":"1313","DOI":"10.3390\/app7121313","volume":"7","author":"M Blaauw","year":"2017","unstructured":"Blaauw, M., Bonada, J.: A neural parametric singing synthesizer modeling timbre and expression from natural songs. Appl. Sci. 7(12), 1313 (2017)","journal-title":"Appl. Sci."},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Chalamandaris, A., Tsiakoulis, P., Raptis, S., Karabetsos, S.: Corpus design for a unit selection TTS system with application to Bulgarian. In: Proceedings of 4th Conference on Human Language Technology: Challenges for Computer Science and Linguistics, pp. 35\u201346 (2009)","DOI":"10.1007\/978-3-642-20095-3_4"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chien, C.M., Lee, H.: Hierarchical prosody modeling for non-autoregressive speech synthesis. In: Proceedings of SLT (2021)","DOI":"10.1109\/SLT48900.2021.9383629"},{"key":"11_CR6","doi-asserted-by":"crossref","unstructured":"Cooper, E., Lai, C.I., Yasuda, Y., Yamagishi, J.: Can speaker augmentation improve multi-speaker end-to-end TTS? In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-1229"},{"key":"11_CR7","unstructured":"Corretge, R.: Praat Vocal Toolkit (2012\u20132020). http:\/\/www.praatvocaltoolkit.com"},{"key":"11_CR8","unstructured":"Daxin, T., Tan, L.: Fine-grained style modelling and transfer in text-to-speech synthesis via content-style disentanglement. arXiv:2011.03943 (2020)"},{"key":"11_CR9","unstructured":"Du, C., Yu, K.: Mixture Density Network for Phone-Level Prosody Modelling in Speech Synthesis. arXiv:2102.00851 (2021)"},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Ellinas, N., et al.: High quality streaming speech synthesis with low, sentence-length-independent latency. In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-2464"},{"key":"11_CR11","unstructured":"Gururani, S., Gupta, K., Shah, D., Shakeri, Z., Pinto, J.: Prosody Transfer in Neural Text to Speech Using Global Pitch and Loudness Features. arXiv:1911.09645 (2019)"},{"key":"11_CR12","unstructured":"Hsu, W.N., et al.: Hierarchical generative modeling for controllable speech synthesis. In: Proceedings of ICLR (2018)"},{"key":"11_CR13","unstructured":"Ito, K., Johnson, L.: The LJ Speech Dataset (2017). https:\/\/keithito.com\/LJ-Speech-Dataset"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Karlapati, S., Moinet, A., Joly, A., Klimkov, V., S\u00e1ez-Trigueros, D., Drugman, T.: CopyCat: many-to-many fine-grained prosody transfer for neural text-to-speech. In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-1251"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Klimkov, V., Ronanki, S., Rohnke, J., Drugman, T.: Fine-grained robust prosody transfer for single-speaker neural text-to-speech. In: Proceedings of Interspeech (2019)","DOI":"10.21437\/Interspeech.2019-2571"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Kumar, N., Goel, S., Narang, A., Lall, B.: Few Shot Adaptive Normalization Driven Multi-Speaker Speech Synthesis. arXiv:2012.07252 (2020)","DOI":"10.21437\/Interspeech.2021-441"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Kurihara, K., Seiyama, N., Kumano, T.: Prosodic features control by symbols as input of sequence-to-sequence acoustic modeling for neural TTS. IEICE Trans. Inf. Syst. E104.D(2), 302\u2013311 (2021)","DOI":"10.1587\/transinf.2020EDP7104"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Lee, Y., Kim, T.: Robust and fine-grained prosody control of end-to-end speech synthesis. In: Proceedings of ICASSP (2019)","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"11_CR19","unstructured":"Neekhara, P., Hussain, S., Dubnov, S., Koushanfar, F., McAuley, J.: Expressive Neural Voice Cloning. arXiv:2102.00151 (2021)"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Park, J., Han, K., Jeong, Y., Lee, S.W.: Phonemic-level duration control using attention alignment for natural speech synthesis. In: Proceedings of ICASSP (2019)","DOI":"10.1109\/ICASSP.2019.8683827"},{"key":"11_CR21","unstructured":"Ping, W., et al.: Deep voice 3: scaling text-to-speech with convolutional sequence learning. In: Proceedings of ICLR (2018)"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Raitio, T., Rasipuram, R., Castellani, D.: Controllable neural text-to-speech synthesis using intuitive prosodic features. In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-2861"},{"key":"11_CR23","doi-asserted-by":"crossref","unstructured":"Shechtman, S., Sorin, A.: Sequence to sequence neural speech synthesis with prosody modification capabilities. In: Proceedings of SSW (2019)","DOI":"10.21437\/SSW.2019-49"},{"key":"11_CR24","doi-asserted-by":"crossref","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning WaveNet on mel spectrogram predictions. In: Proceedings of ICASSP (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"11_CR25","unstructured":"Skerry-Ryan, R., et al.: Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron. In: Proceedings of ICML (2018)"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Sun, G., et al.: Generating diverse and natural text-to-speech samples using a quantized fine-grained VAE and autoregressive prosody prior. In: Proceedings of ICASSP (2020)","DOI":"10.1109\/ICASSP40776.2020.9053436"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Sun, G., Zhang, Y., Weiss, R.J., Cao, Y., Zen, H., Wu, Y.: Fully-hierarchical fine-grained prosody modeling for interpretable speech synthesis. In: Proceedings of ICASSP (2020)","DOI":"10.1109\/ICASSP40776.2020.9053520"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Valle, R., Li, J., Prenger, R., Catanzaro, B.: Mellotron: multispeaker expressive voice synthesis by conditioning on rhythm, pitch and global style tokens. In: Proceedings of ICASSP (2020)","DOI":"10.1109\/ICASSP40776.2020.9054556"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Vioni, A., et al.: Prosodic clustering for phoneme-level prosody control in end-to-end speech synthesis. In: Proceedings of ICASSP (2021)","DOI":"10.1109\/ICASSP39728.2021.9413604"},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Vipperla, R., et al.: Bunched LPCNet: vocoder for low-cost neural text-to-speech systems. In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-2041"},{"key":"11_CR31","unstructured":"Wan, V., an Chan, C., Kenter, T., Vit, J., Clark, R.: CHiVE: varying prosody in speech synthesis with a linguistically driven dynamic hierarchical conditional variational network. In: Proceedings of ICML (2019)"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Wang, J., Li, J., Zhao, X., Wu, Z., Meng, H.: Adversarially learning disentangled speech representations for robust multi-factor voice conversion. arXiv:2102.00184 (2021)","DOI":"10.21437\/Interspeech.2021-1990"},{"key":"11_CR33","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Tacotron: towards end-to-end speech synthesis. In: Proceedings of Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"11_CR34","unstructured":"Wang, Y., et al.: Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis. In: Proceedings of ICML (2018)"},{"key":"11_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, G., Qin, Y., Lee, T.: Learning syllable-level discrete prosodic representation for expressive speech generation. In: Proceedings of Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-2228"},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, J.X., et al.: Voice Conversion by Cascading Automatic Speech Recognition and Text-to-Speech Synthesis with Prosody Transfer. arXiv:2009.01475 (2020)","DOI":"10.21437\/VCC_BC.2020-16"},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, Y.J., Pan, S., He, L., Ling, Z.H.: Learning latent representations for style control and transfer in end-to-end speech synthesis. In: Proceedings of ICASSP (2019)","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"11_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Learning to speak fluently in a foreign language: multilingual speech synthesis and cross-language voice cloning. In: Proceedings of Interspeech (2019)","DOI":"10.21437\/Interspeech.2019-2668"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-87802-3_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,9,21]],"date-time":"2021-09-21T23:40:33Z","timestamp":1632267633000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-87802-3_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030878016","9783030878023"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-87802-3_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"22 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"St Petersburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Russia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/specom.nw.ru\/2021\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"163","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"74","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"45% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.5","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5.5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held online due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}