{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T02:26:24Z","timestamp":1742955984047,"version":"3.40.3"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031216855"},{"type":"electronic","value":"9783031216862"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-21686-2_18","type":"book-chapter","created":{"date-parts":[[2022,11,18]],"date-time":"2022-11-18T08:30:15Z","timestamp":1668760215000},"page":"253-267","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Diffusion-Based Approach to\u00a0Style Modeling in\u00a0Expressive TTS"],"prefix":"10.1007","author":[{"given":"Leonardo","family":"B. de M. M. Marques","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lucas H.","family":"Ueda","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fl\u00e1vio O.","family":"Sim\u00f5es","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"M\u00e1rio","family":"Uliani Neto","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fernando O.","family":"Runstein","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Edson J.","family":"Nagle","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bianca Dal","family":"B\u00f3","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Paula D. P.","family":"Costa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,11,19]]},"reference":[{"doi-asserted-by":"publisher","unstructured":"Aggarwal, V., Cotescu, M., Prateek, N., Lorenzo-Trueba, J., Barra-Chicote, R.: Using VAEs and normalizing flows for one-shot text-to-speech synthesis of expressive speech. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6179\u20136183 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053678","key":"18_CR1","DOI":"10.1109\/ICASSP40776.2020.9053678"},{"doi-asserted-by":"publisher","unstructured":"Aylett, M.P., Clark, L., Cowan, B.R., Torre, I.: Building and designing expressive speech synthesis. In: The Handbook on Socially Interactive Agents: 20 years of Research on Embodied Conversational Agents, Intelligent Virtual Agents, and Social Robotics Volume 1: Methods, Behavior, Cognition, pp. 173\u2013212. Association for Computing Machinery, New York (2021). https:\/\/doi.org\/10.1145\/3477322","key":"18_CR2","DOI":"10.1145\/3477322"},{"doi-asserted-by":"publisher","unstructured":"Chen, N., Zhang, Y., Zen, H., Weiss, R.J., Norouzi, M., Chan, W.: WAVEGRAD: estimating gradients for waveform generation (2020). https:\/\/doi.org\/10.48550\/ARXIV.2009.00713","key":"18_CR3","DOI":"10.48550\/ARXIV.2009.00713"},{"doi-asserted-by":"publisher","unstructured":"Chen, Z., et al.: InferGrad: improving diffusion models for vocoder by considering inference in training (2022). https:\/\/doi.org\/10.48550\/ARXIV.2202.03751","key":"18_CR4","DOI":"10.48550\/ARXIV.2202.03751"},{"unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Ranzato, M., Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems, vol. 34, pp. 8780\u20138794. Curran Associates, Inc. (2021). https:\/\/proceedings.neurips.cc\/paper\/2021\/file\/49ad23d1ec9fa4bd8d77d02681df5cfa-Paper.pdf","key":"18_CR5"},{"issue":"2","key":"18_CR6","doi-asserted-by":"publisher","first-page":"124","DOI":"10.1037\/h0030377","volume":"17","author":"P Ekman","year":"1971","unstructured":"Ekman, P., Friesen, W.V.: Constants across cultures in the face and emotion. J. Pers. Soc. Psychol. 17(2), 124 (1971). https:\/\/doi.org\/10.1037\/h0030377","journal-title":"J. Pers. Soc. Psychol."},{"unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M.F., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/4c5bcfec8584af0d967f1ab10179ca4b-Paper.pdf","key":"18_CR7"},{"doi-asserted-by":"publisher","unstructured":"Hodari, Z., Lai, C., King, S.: Perception of prosodic variation for speech synthesis using an unsupervised discrete representation of f0. In: Proceedings of Speech Prosody 2020, pp. 965\u2013969 (2020). https:\/\/doi.org\/10.21437\/SpeechProsody.2020-197. Published 24 May 2020; Speech Prosody 2020; Conference date: 24-05-2020 Through 28-05-2020","key":"18_CR8","DOI":"10.21437\/SpeechProsody.2020-197"},{"doi-asserted-by":"publisher","unstructured":"Im, C.B., Lee, S.H., Kim, S.B., Lee, S.W.: EMOQ-TTS: emotion intensity quantization for fine-grained controllable emotional text-to-speech. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6317\u20136321 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747098","key":"18_CR9","DOI":"10.1109\/ICASSP43922.2022.9747098"},{"issue":"8","key":"18_CR10","doi-asserted-by":"publisher","first-page":"2119","DOI":"10.1007\/s12369-020-00691-4","volume":"13","author":"J James","year":"2020","unstructured":"James, J., Balamurali, B.T., Watson, C.I., MacDonald, B.: Empathetic speech synthesis and testing for healthcare robots. Int. J. Soc. Robot. 13(8), 2119\u20132137 (2020). https:\/\/doi.org\/10.1007\/s12369-020-00691-4","journal-title":"Int. J. Soc. Robot."},{"doi-asserted-by":"publisher","unstructured":"Jeong, M., Kim, H., Cheon, S.J., Choi, B.J., Kim, N.S.: DIFF-TTS: a denoising diffusion model for text-to-speech (2021). https:\/\/doi.org\/10.48550\/ARXIV.2104.01409","key":"18_CR11","DOI":"10.48550\/ARXIV.2104.01409"},{"doi-asserted-by":"publisher","unstructured":"Klimkov, V., Ronanki, S., Rohnke, J., Drugman, T.: Fine-grained robust prosody transfer for single-speaker neural text-to-speech. In: 2019 Proceedings of the Interspeech, pp. 4440\u20134444 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-2571","key":"18_CR12","DOI":"10.21437\/Interspeech.2019-2571"},{"unstructured":"Kong, Z., Ping, W.: On fast sampling of diffusion probabilistic models. In: ICML Workshop on Invertible Neural Networks, Normalizing Flows, and Explicit Likelihood Models (2021). https:\/\/openreview.net\/forum?id=agj4cdOfrAP","key":"18_CR13"},{"doi-asserted-by":"publisher","unstructured":"Kong, Z., Ping, W., Huang, J., Zhao, K., Catanzaro, B.: DiffWave: a versatile diffusion model for audio synthesis (2020). https:\/\/doi.org\/10.48550\/ARXIV.2009.09761","key":"18_CR14","DOI":"10.48550\/ARXIV.2009.09761"},{"doi-asserted-by":"publisher","unstructured":"Liu, J., Li, C., Ren, Y., Chen, F., Zhao, Z.: DiffSinger: singing voice synthesis via shallow diffusion mechanism (2021). https:\/\/doi.org\/10.48550\/ARXIV.2105.02446","key":"18_CR15","DOI":"10.48550\/ARXIV.2105.02446"},{"unstructured":"Liu, L., et al.: On the variance of the adaptive learning rate and beyond. In: International Conference on Learning Representations (2020). https:\/\/openreview.net\/forum?id=rkgz2aEKDr","key":"18_CR16"},{"key":"18_CR17","doi-asserted-by":"publisher","first-page":"1806","DOI":"10.1109\/TASLP.2021.3076369","volume":"29","author":"R Liu","year":"2021","unstructured":"Liu, R., Sisman, B., Gao, G., Li, H.: Expressive TTS training with frame and style reconstruction loss. IEEE\/ACM Trans. Audio Speech Lang. Proc. 29, 1806\u20131818 (2021). https:\/\/doi.org\/10.1109\/TASLP.2021.3076369","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Proc."},{"unstructured":"Ma, S., McDuff, D., Song, Y.: Neural TTS stylization with adversarial and collaborative games. In: International Conference on Learning Representations (ICLR) (2019). https:\/\/www.microsoft.com\/en-us\/research\/publication\/neural-tts-stylization-with-adversarial-and-collaborative-games\/","key":"18_CR18"},{"unstructured":"Neekhara, P., Hussain, S., Dubnov, S., Koushanfar, F., McAuley, J.: Expressive neural voice cloning. In: Balasubramanian, V.N., Tsang, I. (eds.) Proceedings of The 13th Asian Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 157, pp. 252\u2013267. PMLR, 17\u201319 November 2021. https:\/\/proceedings.mlr.press\/v157\/neekhara21a.html","key":"18_CR19"},{"unstructured":"Obin, N.: MeLos: analysis and modelling of speech prosody and speaking style. Ph.D. thesis, Ecole Doctorale Informatique, T\u00e9l\u00e9communications et Electronique (EDITE) (2011). https:\/\/tel.archives-ouvertes.fr\/tel-00694687v2\/document","key":"18_CR20"},{"unstructured":"Ren, Y., et al.: FastSpeech: fast, robust and controllable text to speech. In: Wallach, H., Larochelle, H., Beygelzimer, A., d\u2019 Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 32. Curran Associates, Inc. (2019). https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/f63f65b503e22cb970527f23c9ad7db1-Paper.pdf","key":"18_CR21"},{"doi-asserted-by":"publisher","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning waveNet on MEL spectrogram predictions. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4779\u20134783 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8461368","key":"18_CR22","DOI":"10.1109\/ICASSP.2018.8461368"},{"unstructured":"Skerry-Ryan, R., et al.: Towards end-to-end prosody transfer for expressive speech synthesis with tacotron. In: Dy, J., Krause, A. (eds.) Proceedings of the 35th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 80, pp. 4693\u20134702. PMLR, 10\u201315 July 2018. https:\/\/proceedings.mlr.press\/v80\/skerry-ryan18a.html","key":"18_CR23"},{"unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: Bach, F., Blei, D. (eds.) Proceedings of the 32nd International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 37, pp. 2256\u20132265. PMLR, Lille, France, 07\u201309 July 2015. https:\/\/proceedings.mlr.press\/v37\/sohl-dickstein15.html","key":"18_CR24"},{"doi-asserted-by":"publisher","unstructured":"Stanton, D., Wang, Y., Skerry-Ryan, R.: Predicting expressive speaking style from text in end-to-end speech synthesis. In: 2018 IEEE Spoken Language Technology Workshop (SLT), pp. 595\u2013602 (2018). https:\/\/doi.org\/10.1109\/SLT.2018.8639682","key":"18_CR25","DOI":"10.1109\/SLT.2018.8639682"},{"doi-asserted-by":"publisher","unstructured":"Sun, G., Zhang, Y., Weiss, R.J., Cao, Y., Zen, H., Wu, Y.: Fully-hierarchical fine-grained prosody modeling for interpretable speech synthesis. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6264\u20136268. IEEE (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053520","key":"18_CR26","DOI":"10.1109\/ICASSP40776.2020.9053520"},{"doi-asserted-by":"publisher","unstructured":"Tits, N., Wang, F., Haddad, K.E., Pagel, V., Dutoit, T.: Visualization and interpretation of latent spaces for controlling expressive speech synthesis through audio analysis (2019). https:\/\/doi.org\/10.48550\/ARXIV.1903.11570","key":"18_CR27","DOI":"10.48550\/ARXIV.1903.11570"},{"doi-asserted-by":"publisher","unstructured":"Tomczak, J.M., Welling, M.: Improving variational auto-encoders using householder flow (2016). https:\/\/doi.org\/10.48550\/ARXIV.1611.09630","key":"18_CR28","DOI":"10.48550\/ARXIV.1611.09630"},{"doi-asserted-by":"publisher","unstructured":"Ueda, L.H., Costa, P.D.P., Simoes, F.O., Neto, M.U.: Are we truly modeling expressiveness? a study on expressive TTS in Brazilian Portuguese for real-life application styles. In: Proceedings of the 11th ISCA Speech Synthesis Workshop (SSW 2011), pp. 84\u201389 (2021). https:\/\/doi.org\/10.21437\/SSW.2021-15","key":"18_CR29","DOI":"10.21437\/SSW.2021-15"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 30. Curran Associates, Inc. (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf","key":"18_CR30"},{"unstructured":"Wang, Y., et al.: Uncovering latent style factors for expressive speech synthesis. In: NIPS Workshop on Machine Learning for Audio Signal Processing (ML4Audio) (2017)","key":"18_CR31"},{"unstructured":"Wang, Y., et al.: Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis. In: Dy, J., Krause, A. (eds.) Proceedings of the 35th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 80, pp. 5180\u20135189. PMLR, 10\u201315 July 2018. https:\/\/proceedings.mlr.press\/v80\/wang18h.html","key":"18_CR32"},{"doi-asserted-by":"publisher","unstructured":"Wu, N.Q., Liu, Z.C., Ling, Z.H.: Discourse-level prosody modeling with a variational autoencoder for non-autoregressive expressive speech synthesis. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7592\u20137596 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746238","key":"18_CR33","DOI":"10.1109\/ICASSP43922.2022.9746238"},{"doi-asserted-by":"publisher","unstructured":"Yamamoto, R., Song, E., Kim, J.M.: Parallel WaveGan: a fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6199\u20136203 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053795","key":"18_CR34","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"doi-asserted-by":"publisher","unstructured":"Zhang, Y.J., Pan, S., He, L., Ling, Z.H.: Learning latent representations for style control and transfer in end-to-end speech synthesis. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6945\u20136949 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8683623","key":"18_CR35","DOI":"10.1109\/ICASSP.2019.8683623"}],"container-title":["Lecture Notes in Computer Science","Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-21686-2_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T17:10:33Z","timestamp":1709831433000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-21686-2_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031216855","9783031216862"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-21686-2_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"19 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"BRACIS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brazilian Conference on Intelligent Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Campinas","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brazil","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 November 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"bracis2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www2.sbc.org.br\/bracis2022\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"JEMS","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"225","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"89","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"40% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}