{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,23]],"date-time":"2025-01-23T05:26:36Z","timestamp":1737609996507,"version":"3.33.0"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T00:00:00Z","timestamp":1737504000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T00:00:00Z","timestamp":1737504000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-024-03652-0","type":"journal-article","created":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T15:33:41Z","timestamp":1737560021000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["An Investigation of Phrase Break Prediction in an End-to-End TTS System"],"prefix":"10.1007","volume":"6","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7685-089X","authenticated-orcid":false,"given":"Anandaswarup","family":"Vadapalli","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,22]]},"reference":[{"key":"3652_CR1","unstructured":"Skerry-Ryan RJ, Battenberg E, Xiao Y, Wang Y, Stanton D, Shor J, Weiss RJ, Clark R, Saurous RA. Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron. In: Dy JG, Krause A, editors. Proceedings of the 35th international conference on machine learning, ICML 2018, Stockholmsm\u00e4ssan, Stockholm, Sweden, July 10\u201315, 2018, Proceedings of machine learning research, vol.\u00a080 (PMLR, 2018). pp. 4700\u20139."},{"key":"3652_CR2","unstructured":"Wang Y, Stanton D, Zhang Y, Skerry-Ryan RJ, Battenberg E, Shor J, Xiao Y, Jia Y, Ren F, Saurous RA. Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis. In: Dy JG, Krause A, editors. Proceedings of the 35th international conference on machine learning, ICML 2018, Stockholmsm\u00e4ssan, Stockholm, Sweden, July 10\u201315, 2018, Proceedings of machine learning research, vol.\u00a080 (PMLR, 2018). pp. 5167\u201376."},{"key":"3652_CR3","doi-asserted-by":"publisher","unstructured":"Zhang Y, Pan S, He L, Ling Z. Learning latent representations for style control and transfer in end-to-end speech synthesis. In: IEEE international conference on acoustics, speech and signal processing, ICASSP 2019, Brighton, United Kingdom, May 12\u201317, (IEEE, 2019), 2019. pp. 6945\u20139. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683623.","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"3652_CR4","doi-asserted-by":"publisher","unstructured":"Lee Y, Kim T. Robust and fine-grained prosody control of end-to-end speech synthesis. In: IEEE international conference on acoustics, speech and signal processing, ICASSP 2019, Brighton, United Kingdom, May 12\u201317, (IEEE, 2019), 2019. pp. 5911\u20135. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683501.","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"3652_CR5","unstructured":"Hsu W, Zhang Y, Weiss RJ, Zen H, Wu Y, Wang Y, Cao Y, Jia Y, Chen Z, Shen J, Nguyen P, Pang R. Hierarchical generative modeling for controllable speech synthesis. In: 7th international conference on learning representations, ICLR 2019, New Orleans, LA, USA, May 6\u20139, 2019 (OpenReview.net, 2019)."},{"key":"3652_CR6","unstructured":"Battenberg E, Mariooryad S, Stanton D, Skerry-Ryan RJ, Shannon M, Kao D, Bagby T. Effective use of variational embedding capacity in expressive end-to-end speech synthesis. CoRR. 2019. arXiv:abs\/1906.03402."},{"key":"3652_CR7","doi-asserted-by":"publisher","unstructured":"Aggarwal V, Cotescu M, Prateek N, Lorenzo-Trueba J, Barra-Chicote R. Using VAEs and normalizing flows for one-shot text-to-speech synthesis of expressive speech. In: 2020 IEEE international conference on acoustics, speech and signal processing, ICASSP 2020, Barcelona, Spain, May 4\u20138 (IEEE, 2020), 2020. pp. 6179\u201383. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053678.","DOI":"10.1109\/ICASSP40776.2020.9053678"},{"key":"3652_CR8","doi-asserted-by":"publisher","unstructured":"Valle R, Li J, Prenger R, Catanzaro B. Mellotron: multispeaker expressive voice synthesis by conditioning on rhythm, pitch and global style tokens. In: 2020 IEEE international conference on acoustics, speech and signal processing, ICASSP 2020, Barcelona, Spain, May 4\u20138 (IEEE, 2020), 2020. pp. 6189\u201393. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054556.","DOI":"10.1109\/ICASSP40776.2020.9054556"},{"key":"3652_CR9","doi-asserted-by":"publisher","unstructured":"Hu T, Shrivastava A, Tuzel O, Dhir C. Unsupervised style and content separation by minimizing mutual information for speech synthesis. In: 2020 IEEE international conference on acoustics, speech and signal processing, ICASSP 2020, Barcelona, Spain, May 4\u20138 (IEEE, 2020), 2020. pp. 3267\u201371. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054591.","DOI":"10.1109\/ICASSP40776.2020.9054591"},{"key":"3652_CR10","doi-asserted-by":"publisher","first-page":"1470","DOI":"10.1109\/LSP.2020.3016564","volume":"27","author":"R Liu","year":"2020","unstructured":"Liu R, Sisman B, Bao F, Gao G, Li H. Modeling prosodic phrasing with multi-task learning in tacotron-based TTS. IEEE Signal Process Lett. 2020;27:1470\u20134. https:\/\/doi.org\/10.1109\/LSP.2020.3016564.","journal-title":"IEEE Signal Process. Lett."},{"key":"3652_CR11","doi-asserted-by":"publisher","unstructured":"Xiao Y, He L, Ming H, Soong FK. Improving prosody with linguistic and bert derived features in multi-speaker based mandarin Chinese neural TTS. In: 2020 IEEE international conference on acoustics, speech and signal processing, ICASSP 2020, Barcelona, Spain, May 4\u20138 (IEEE, 2020), 2020. pp. 6704\u20138. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054337.","DOI":"10.1109\/ICASSP40776.2020.9054337"},{"key":"3652_CR12","doi-asserted-by":"crossref","unstructured":"Silverman, Beckman ME, Pitrelli JF, Ostendorf M, Wightman CW, Price P, Pierrehumbert JB, Hirschberg J. ToBI: a standard for labeling English prosody. In: ICSLP (ISCA, 1992).","DOI":"10.21437\/ICSLP.1992-260"},{"issue":"2","key":"3652_CR13","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1006\/csla.1998.0041","volume":"12","author":"P Taylor","year":"1998","unstructured":"Taylor P, Black AW. Assigning phrase breaks from part-of-speech sequences. Comput Speech Lang. 1998;12(2):99\u2013117. https:\/\/doi.org\/10.1006\/csla.1998.0041.","journal-title":"Comput. Speech Lang."},{"key":"3652_CR14","unstructured":"Prahallad K, Raghavendra EV, Black AW. Learning speaker-specific phrase breaks for text-to-speech systems. In: Proceedings of 7th ISCA speech synthesis workshop (SSW7) (Kyoto, Japan, 2010). pp. 148\u201353."},{"key":"3652_CR15","doi-asserted-by":"crossref","unstructured":"Prahallad K, Raghavendra EV, Black AW. Semi-supervised learning of acoustic driven prosodic phrase breaks for text-to-speech systems. In: Proceedings of 5th international conference on speech prosody (speech prosody 2010) (Chicago, Illinois, 2010).","DOI":"10.21437\/SpeechProsody.2010-125"},{"issue":"1\u20132","key":"3652_CR16","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1016\/0004-3702(93)90020-C","volume":"63","author":"J Hirschberg","year":"1993","unstructured":"Hirschberg J. Pitch accent in context: predicting intonational prominence from text. Artif Intell. 1993;63(1\u20132):305\u201340. https:\/\/doi.org\/10.1016\/0004-3702(93)90020-C.","journal-title":"Artif Intell"},{"issue":"3","key":"3652_CR17","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1006\/csla.1996.0010","volume":"10","author":"KN Ross","year":"1996","unstructured":"Ross KN, Ostendorf M. Prediction of abstract prosodic labels for speech synthesis. Comput Speech Lang. 1996;10(3):155\u201385. https:\/\/doi.org\/10.1006\/csla.1996.0010.","journal-title":"Comput Speech Lang"},{"issue":"2","key":"3652_CR18","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1006\/csla.1994.1005","volume":"8","author":"JPH van Santen","year":"1994","unstructured":"van Santen JPH. Assignment of segmental duration in text-to-speech synthesis. Comput Speech Lang. 1994;8(2):95\u2013128. https:\/\/doi.org\/10.1006\/csla.1994.1005.","journal-title":"Comput Speech Lang"},{"issue":"3","key":"3652_CR19","doi-asserted-by":"publisher","first-page":"1707","DOI":"10.1121\/1.402450","volume":"91","author":"C Wightman","year":"1992","unstructured":"Wightman C, Shattuck-Hufnagel S, Ostendorf M, Price PJ. Segmental durations in the vicinity of prosodic phrase boundaries. J Acoust Soc Am. 1992;91(3):1707\u201317.","journal-title":"J Acoust Soc Am"},{"key":"3652_CR20","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1006\/jpho.2001.0145","volume":"29","author":"L Redi","year":"2001","unstructured":"Redi L, Shattuck-Hufnagel S. Variation in realization of glottalization in normal speakers. J Phonet. 2001;29:407\u201329.","journal-title":"J Phonet"},{"key":"3652_CR21","doi-asserted-by":"crossref","unstructured":"Kim H, Yoon T, Cole J, Hasegawa-Johnson M. Acoustic differentiation of L- and L-L% in switchboard and radio news speech. In: Proceedings of speech prosody (Dresden, 2006).","DOI":"10.21437\/SpeechProsody.2006-165"},{"key":"3652_CR22","unstructured":"Parlikar A, Black AW. Minimum error rate training for phrasing in speech synthesis. In: Proceedings of 8th ISCA speech synthesis workshop (SSW8) (Barcelona, Spain, 2013). pp. 13\u20136."},{"key":"3652_CR23","doi-asserted-by":"crossref","unstructured":"Watts O, Stan A, Mamiya Y, Suni A, Burgos JM, Montero JM. The Simple4All entry to the Blizzard challenge 2013. In: Proceedings of the 2013 Blizzard challenge workshop. 2013.","DOI":"10.21437\/Blizzard.2013-13"},{"key":"3652_CR24","unstructured":"Vadapalli A, Bhaskararao P, Prahallad K. Significance of word-terminal syllables for prediction of phrase breaks in text-to-speech systems in Indian languages. In: Proceedings of 8th ISCA speech synthesis workshop (SSW8) (Barcelona, Spain). 2013. pp. 189\u201394."},{"key":"3652_CR25","doi-asserted-by":"crossref","unstructured":"Watts O, Yamagishi J, King S. Unsupervised continuous-valued word features for phrase-break prediction without a part-of-speech tagger. In: Proceedings of interspeech (Florence, Italy). 2011. pp. 2157\u201360.","DOI":"10.21437\/Interspeech.2011-565"},{"key":"3652_CR26","doi-asserted-by":"crossref","unstructured":"Vadapalli A, Prahallad K. Learning continuous-valued word representations for phrase break prediction. In: INTERSPEECH 2014, 15th annual conference of the international speech communication association, Singapore, September 14\u201318, (ISCA, 2014). 2014. pp. 41\u20135.","DOI":"10.21437\/Interspeech.2014-9"},{"key":"3652_CR27","doi-asserted-by":"crossref","unstructured":"Watts O, Gangireddy S, Yamagishi J, King S, Renals S, Stan A, Giurgiu M. Neural net word representations for phrase-break prediction without a part of speech tagger. In: Proceedings IEEE international conference on acoustics, speech, and signal processing (ICASSP) (Florence, Italy), 2014. pp. 2599\u2013603.","DOI":"10.1109\/ICASSP.2014.6854070"},{"key":"3652_CR28","doi-asserted-by":"publisher","unstructured":"Vadapalli A, Gangashetty SV. An investigation of recurrent neural network architectures using word embeddings for phrase break prediction. In: INTERSPEECH 2016, 17th annual conference of the international speech communication association, San Francisco, CA, USA, September 8\u201312, (ISCA, 2016), 2016. pp. 2308\u201312. https:\/\/doi.org\/10.21437\/Interspeech.2016-885.","DOI":"10.21437\/Interspeech.2016-885"},{"key":"3652_CR29","doi-asserted-by":"crossref","unstructured":"Parlikar A, Black AW. A grammar based approach to style specific phrase prediction. In: Proceedings of interspeech (Florence, Italy). 2011. pp. 2149\u201352.","DOI":"10.21437\/Interspeech.2011-563"},{"key":"3652_CR30","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1016\/0885-2308(92)90025-Y","volume":"6","author":"M Wang","year":"1992","unstructured":"Wang M, Hirschberg J. Automatic classification of intonational phrase boundaries. Comput Speech Lang. 1992;6:175\u201396.","journal-title":"Comput Speech Lang"},{"issue":"11\u201312","key":"3652_CR31","doi-asserted-by":"publisher","first-page":"888","DOI":"10.1016\/j.specom.2008.03.003","volume":"50","author":"E Navas","year":"2008","unstructured":"Navas E, Hernez I, Sainz I. Evaluation of automatic break insertion for an agglutinative and inflected language. Speech Commun. 2008;50(11\u201312):888\u201399.","journal-title":"Speech Commun"},{"key":"3652_CR32","doi-asserted-by":"crossref","unstructured":"Schmid H, Atterer M. New statistical methods for phrase break prediction. In: Proceedings of 20th international conference on computational linguistics, COLING \u201904 (Geneva, Switzerland, 2004).","DOI":"10.3115\/1220355.1220450"},{"key":"3652_CR33","unstructured":"Bonafonte A, Ag\u00fcero P. Phrase break prediction using a finite state transducer. In: Proceedings of 11th international workshop on advances in speech technology. 2004."},{"key":"3652_CR34","unstructured":"Busser B, Daelemans W, van\u00a0den Bosch A. Predicting phrase breaks with memory-based learning. In: Proceedings of 4th ISCA speech synthesis workshop. 2001."},{"key":"3652_CR35","doi-asserted-by":"crossref","unstructured":"Parlikar A, Black AW. Data-driven phrasing for speech synthesis in low-resource languages. In: Proceedings of IEEE international conference on acoustics, speech and signal processing (Kyoto, Japan, 2012).","DOI":"10.1109\/ICASSP.2012.6288798"},{"key":"3652_CR36","doi-asserted-by":"crossref","unstructured":"Krishna NS, Murthy HA. A new prosodic phrasing model for Indian language Telugu. In: INTERSPEECH-2004-ICSLP, vol. 1. 2004. pp. 793\u20136.","DOI":"10.21437\/Interspeech.2004-298"},{"key":"3652_CR37","unstructured":"Watts O, Unsupervised learning for text-to-speech synthesis. Ph.D. thesis, University of Edinburgh 2012."},{"key":"3652_CR38","doi-asserted-by":"publisher","unstructured":"Klimkov V, Nadolski A, Moinet A, Putrycz B, Barra-Chicote R, Merritt T, Drugman T. Phrase break prediction for long-form reading TTS: exploiting text structure information. In: Lacerda F, editor. Interspeech 2017, 18th annual conference of the international speech communication association, Stockholm, Sweden, August 20\u201324, 2017 (ISCA). 2017. pp. 1064\u20138. https:\/\/doi.org\/10.21437\/INTERSPEECH.2017-419.","DOI":"10.21437\/INTERSPEECH.2017-419"},{"key":"3652_CR39","doi-asserted-by":"publisher","unstructured":"Devlin J, Chang M, Lee K, Toutanova K. BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein J, Doran C, Solorio T, editors. Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2\u20137, 2019, volume 1 (long and short papers) (Association for Computational Linguistics), 2019. pp. 4171\u201386. https:\/\/doi.org\/10.18653\/V1\/N19-1423.","DOI":"10.18653\/V1\/N19-1423"},{"key":"3652_CR40","doi-asserted-by":"publisher","unstructured":"Futamata K, Park B, Yamamoto R, Tachibana K. Phrase break prediction with bidirectional encoder representations in Japanese text-to-speech synthesis. In: Hermansky H, Cernock\u00fd H, Burget L, Lamel L, Scharenborg O, Motl\u00edcek P, editors. 22nd annual conference of the international speech communication association, Interspeech 2021, Brno, Czechia, August 30\u2013September 3, 2021, (ISCA), 2021. pp. 3126\u201330. https:\/\/doi.org\/10.21437\/INTERSPEECH.2021-252.","DOI":"10.21437\/INTERSPEECH.2021-252"},{"key":"3652_CR41","doi-asserted-by":"publisher","unstructured":"Zen H, Dang V, Clark R, Zhang Y, Weiss RJ, Jia Y, Chen Z, Wu Y. LibriTTS: a corpus derived from LibriSpeech for text-to-speech. In: Kubin G, Kacic Z, editors Interspeech 2019, 20th annual conference of the international speech communication association, Graz, Austria, 15\u201319 September 2019 (ISCA), 2019. pp. 1526\u201330. https:\/\/doi.org\/10.21437\/Interspeech.2019-2441.","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"3652_CR42","doi-asserted-by":"publisher","unstructured":"McAuliffe M, Socolof M, Mihuc S, Wagner M, Sonderegger M. Montreal forced aligner: trainable text-speech alignment using Kaldi. In: Lacerda F, editor. Interspeech 2017, 18th annual conference of the international speech communication association, Stockholm, Sweden, August 20\u201324, 2017 (ISCA), 2017. pp. 498\u2013502. https:\/\/doi.org\/10.21437\/INTERSPEECH.2017-1386.","DOI":"10.21437\/INTERSPEECH.2017-1386"},{"key":"3652_CR43","unstructured":"Kingma DP, Ba J. Adam: a method for stochastic optimization. In: Bengio Y, LeCun Y, editors. 3rd international conference on learning representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, conference track proceedings. 2015. http:\/\/arxiv.org\/abs\/1412.6980."},{"key":"3652_CR44","unstructured":"van Rijsbergen CJ. Information retrieval (Butterworth, 1979)."},{"key":"3652_CR45","doi-asserted-by":"publisher","unstructured":"Battenberg E, Skerry-Ryan RJ, Mariooryad S, Stanton D, Kao D, Shannon M, Bagby T. Location-relative attention mechanisms for robust long-form speech synthesis. In: 2020 IEEE international conference on acoustics, speech and signal processing, ICASSP 2020, Barcelona, Spain, May 4\u20138, 2020 (IEEE, 2020). pp. 6194\u20138. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054106.","DOI":"10.1109\/ICASSP40776.2020.9054106"},{"key":"3652_CR46","unstructured":"Kalchbrenner N, Elsen E, Simonyan K, Noury S, Casagrande N, Lockhart E, Stimberg F, van\u00a0den Oord A, Dieleman S, Kavukcuoglu K. Efficient neural audio synthesis. In: Dy JG, Krause A, editors. Proceedings of the 35th international conference on machine learning, ICML 2018, Stockholmsm\u00e4ssan, Stockholm, Sweden, July 10\u201315, 2018, Proceedings of machine learning research, vol.\u00a080 (PMLR). pp. 2415\u201324. 2018. http:\/\/proceedings.mlr.press\/v80\/kalchbrenner18a.html."},{"key":"3652_CR47","doi-asserted-by":"publisher","unstructured":"Lorenzo-Trueba J, Drugman T, Latorre J, Merritt T, Putrycz B, Barra-Chicote R, Moinet A, Aggarwal V. Towards achieving robust universal neural vocoding. In: Kubin G, Kacic Z, editors. Interspeech 2019, 20th annual conference of the international speech communication association, Graz, Austria, 15\u201319 September 2019 (ISCA, 2019), pp. 181\u20135. https:\/\/doi.org\/10.21437\/Interspeech.2019-1424.","DOI":"10.21437\/Interspeech.2019-1424"},{"key":"3652_CR48","unstructured":"Ito K, Johnson L. The LJ speech dataset. 2017. https:\/\/keithito.com\/LJ-Speech-Dataset\/."},{"key":"3652_CR49","doi-asserted-by":"publisher","unstructured":"Kharitonov E, Vincent D, Borsos Z, Marinier R, Girgin S, Pietquin O, Sharifi M, Tagliasacchi M, Zeghidour N. Speak, read and prompt: high-fidelity text-to-speech with minimal supervision. CoRR. 2023. arXiv:abs\/2302.03540. https:\/\/doi.org\/10.48550\/arXiv.2302.03540.","DOI":"10.48550\/arXiv.2302.03540"},{"key":"3652_CR50","doi-asserted-by":"publisher","unstructured":"Wang C, Chen S, Wu Y, Zhang Z, Zhou L, Liu S, Chen Z, Liu Y, Wang H, Li J, He L, Zhao S, Wei F. Neural codec language models are zero-shot text to speech synthesizers. CoRR. 2023. arXiv:abs\/2301.02111. https:\/\/doi.org\/10.48550\/arXiv.2301.02111.","DOI":"10.48550\/arXiv.2301.02111"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-024-03652-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-024-03652-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-024-03652-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T15:33:52Z","timestamp":1737560032000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-024-03652-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,22]]},"references-count":50,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2025,2]]}},"alternative-id":["3652"],"URL":"https:\/\/doi.org\/10.1007\/s42979-024-03652-0","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,22]]},"assertion":[{"value":"29 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 December 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The author has no conflicts or conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"The author consents to the publication of this work.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Materials availability"}},{"value":"\u2013 The code for the phrasing models is available online at  \u2013 The code for the end-to-end TTS system is available online at","order":6,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}}],"article-number":"91"}}