{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T03:52:26Z","timestamp":1752983546765,"version":"3.37.3"},"reference-count":59,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2020,11,2]],"date-time":"2020-11-02T00:00:00Z","timestamp":1604275200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,11,2]],"date-time":"2020-11-02T00:00:00Z","timestamp":1604275200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"CMCU","award":["15G1405"],"award-info":[{"award-number":["15G1405"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,3]]},"DOI":"10.1007\/s11042-020-09901-7","type":"journal-article","created":{"date-parts":[[2020,11,2]],"date-time":"2020-11-02T22:02:45Z","timestamp":1604354565000},"page":"8331-8353","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Duration modelling and evaluation for Arabic statistical parametric speech synthesis"],"prefix":"10.1007","volume":"80","author":[{"given":"Imene","family":"Zangar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8929-3609","authenticated-orcid":false,"given":"Zied","family":"Mnasri","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vincent","family":"Colotte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Denis","family":"Jouvet","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,11,2]]},"reference":[{"key":"9901_CR1","unstructured":"Abdelhamid O, Abdou SM, Rashwan M (2006) Improving Arabic HMM-based speech synthesis quality. In: Proceeding international conference on spoken language processing, Pittsburgh, Pennsylvania, pp 1332\u20131335"},{"key":"9901_CR2","doi-asserted-by":"crossref","unstructured":"Abdelmalek R, Mnasri Z (2016) High quality Arabic text-to-speech synthesis using unit selection. In: Proceeding IEEE international multi-conference on systems, signals, signals & devices, Leipzig, Germany, pp 1\u20135","DOI":"10.1109\/SSD.2016.7473681"},{"key":"9901_CR3","unstructured":"Arabic Speech Corpus (2020) http:\/\/en.arabicspeechcorpus.com\/. Accessed Aug 2020"},{"key":"9901_CR4","unstructured":"Boukadida F, Ellouze N (2005) Mod\u00e9lisation Statistique de la dur\u00e9e des Voyelles en Parole Arabe. In: Proceeding science of electronics, telecommunications and information technology conference, Tunisia, pp 1\u20134"},{"key":"9901_CR5","doi-asserted-by":"crossref","unstructured":"Campbell WN (1993) Predicting segmental durations for accommodation within a syllable-level timing framework. In: Proceeding european conference on speech communication and technology, Berlin, Germany, pp 1332\u20131335","DOI":"10.21437\/Eurospeech.1993-267"},{"key":"9901_CR6","doi-asserted-by":"crossref","unstructured":"Chen B, Bian T, Yu K (2017) Discrete duration model for speech synthesis. In: Proceeding annual conference of the international speech communication association, Stockholm, Sweden, pp 789\u2013793","DOI":"10.21437\/Interspeech.2017-1144"},{"key":"9901_CR7","doi-asserted-by":"crossref","unstructured":"Chen B, Lai J, Yu K (2017) Comparison of modeling target in LSTM-RNN duration model. In: Proceeding annual conference of the international speech communication association, Stockholm, Sweden, pp 794\u2013798","DOI":"10.21437\/Interspeech.2017-1152"},{"issue":"5","key":"9901_CR8","doi-asserted-by":"publisher","first-page":"421","DOI":"10.1109\/89.466653","volume":"3","author":"S Dimolitsas","year":"1995","unstructured":"Dimolitsas S, Corcoran FL, Ravishankar C (1995) Dependence of opinion scores on listening sets used in degradation category rating assessments. IEEE Trans Speech Audio Process 3(5):421\u2013424","journal-title":"IEEE Trans Speech Audio Process"},{"issue":"3-4","key":"9901_CR9","doi-asserted-by":"publisher","first-page":"435","DOI":"10.1016\/0167-6393(93)90042-J","volume":"13","author":"T Dutoit","year":"1993","unstructured":"Dutoit T, Leich H (1993) MBR-PSOLA: Text-to-speech Synthesis based on an MBE re-synthesis of the segments database. Speech Comm 13(3-4):435\u2013440","journal-title":"Speech Comm"},{"key":"9901_CR10","doi-asserted-by":"crossref","unstructured":"Fernandez R, Rendel A, Ramabhadran B, Hoory R (2014) Prosody contour prediction with long short-term memory, bi-directional, deep recurrent neural networks. In: Proceeding annual conference of the international speech communication association, Singapore, pp 2268\u20132272","DOI":"10.21437\/Interspeech.2014-445"},{"key":"9901_CR11","doi-asserted-by":"crossref","unstructured":"Gao B, Qian Y, Wu Z, Soong FK (2008) Duration refinement by jointly optimizing state and longer unit likelihood. In: Proceeding annual conference of the international speech communication association, Brisbane, Australia, pp 2266\u20132269","DOI":"10.21437\/Interspeech.2008-556"},{"issue":"8","key":"9901_CR12","doi-asserted-by":"publisher","first-page":"1223","DOI":"10.1109\/29.1651","volume":"36","author":"DW Griffin","year":"1988","unstructured":"Griffin DW, Lim JS (1988) Multiband excitation vocoder. IEEE Trans Acoust Speech Signal Process 36(8):1223\u20131235","journal-title":"IEEE Trans Acoust Speech Signal Process"},{"key":"9901_CR13","unstructured":"HTS toolkit (2018) http:\/\/hts.sp.nitech.ac.jp. Accessed Nov 2018"},{"key":"9901_CR14","unstructured":"Halabi N (2016) Modern standard arabic phonetics for speech synthesis, Dissertation, University of Southamtpon"},{"key":"9901_CR15","unstructured":"Halabi N, Wald M (2016) Phonetic inventory for an Arabic speech corpus. In: Proceeding international conference on language resources and evaluation, Portoroz, Slovenia, pp 734\u2013738"},{"key":"9901_CR16","doi-asserted-by":"crossref","unstructured":"Henter GE, Ronanki S, Watts O, Wester M, Wu Z, King S (2016) Robust TTS duration modelling using DNNs. In: Proceeding international conference on acoustics, speech and signal processing, Shanghai, China, pp 5130\u20135134","DOI":"10.1109\/ICASSP.2016.7472655"},{"issue":"8","key":"9901_CR17","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"9901_CR18","unstructured":"Houidhek A, Colotte V, Mnasri Z, Jouvet D, Zangar I (2017) Statistical modelling of speech units in HMM-based speech synthesis for Arabic. In: Proceeding language & technology conference, Poznan, Poland, pp 1\u20136"},{"key":"9901_CR19","unstructured":"Hunt AJ, Black AW (1996) Unit selection in a concatenative speech synthesis system using a large speech database. In: Proceeding IEEE international conference on acoustics, speech and signal processing, atlanta, GA, USA, pp 373\u2013376"},{"issue":"2","key":"9901_CR20","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1002\/ecja.4400660203","volume":"66","author":"S Imai","year":"1983","unstructured":"Imai S, Sumita K, Furuichi C (1983) Mel log spectrum approximation (MLSA) filter for speech synthesis. Electron Commun Japan (Part I:, Commun) 66 (2):10\u201318","journal-title":"Electron Commun Japan (Part I:, Commun)"},{"key":"9901_CR21","unstructured":"Ishimatsu Y (2001) Investigation of state duration model based on gamma distribution for HMM-based speech synthesis. IEICE Technical Report, SP2001-81"},{"key":"9901_CR22","doi-asserted-by":"crossref","unstructured":"Kawahara H (1997) Speech representation and transformation using adaptive interpolation of weighted spectrum: vocoder revisited. In: Proceeding international conference on acoustics, speech and signal processing, Munich, Germany, pp 1303\u20131306","DOI":"10.1109\/ICASSP.1997.596185"},{"issue":"5","key":"9901_CR23","doi-asserted-by":"publisher","first-page":"1208","DOI":"10.1121\/1.380986","volume":"59","author":"DH Klatt","year":"1976","unstructured":"Klatt DH (1976) Linguistic uses of segmental duration in english: Acoustic and perceptual evidence. J Acoust Soc Am 59(5):1208\u20131221","journal-title":"J Acoust Soc Am"},{"key":"9901_CR24","unstructured":"Klatt DH, William EC (1975) Perception of segment duration in sentence contexts. In: Structure and process in speech perception. Springer, Berlin, pp 69\u201389"},{"key":"9901_CR25","unstructured":"Lazaridis A, Honnet PE, Garner PN (2014) SVR vs MLP for Phone Duration Modelling in HMM-based Speech Synthesis, Technical Report No.EPFL-REPORT-198140"},{"key":"9901_CR26","unstructured":"Lu H, Wu YJ, Tokuda K, Dai LR, Wang RH (2009) Full covariance state duration modeling for HMM-based speech synthesis. In: Proceeding international conference on acoustics, speech and signal processing, Taipei, Taiwan, pp 4033\u20134036"},{"key":"9901_CR27","unstructured":"MERLIN toolkit (2018) https:\/\/github.com\/CSTR-Edinburgh\/Merlin. Accessed Nov 2018"},{"key":"9901_CR28","unstructured":"Mixdorff H (2002) An integrated approach to modeling German prosody, Doktor-Ingenieur habilitatus Dissertation, Technische Universitaet Dresden"},{"issue":"5","key":"9901_CR29","first-page":"533","volume":"4","author":"Z Mnasri","year":"2009","unstructured":"Mnasri Z, Boukadida F, Ellouze N (2009) Segmental duration modeling using non parametric statistical learning. Int Rev Comput Softw 4(5):533\u2013542","journal-title":"Int Rev Comput Softw"},{"issue":"7","key":"9901_CR30","doi-asserted-by":"publisher","first-page":"1877","DOI":"10.1587\/transinf.2015EDP7457","volume":"99","author":"M Morise","year":"2016","unstructured":"Morise M, Yokomori F, Ozawa K (2016) WORLD: A vocoder-based high-quality speech synthesis system for real-time applications. IEICE Trans Inform Syst 99(7):1877\u20131884","journal-title":"IEICE Trans Inform Syst"},{"issue":"5-6","key":"9901_CR31","doi-asserted-by":"publisher","first-page":"453","DOI":"10.1016\/0167-6393(90)90021-Z","volume":"9","author":"E Moulines","year":"1990","unstructured":"Moulines E, Charpentier F (1990) Pitch-synchronous waveform processing techniques for text-to-speech synthesis using diphones. Speech Comm 9 (5-6):453\u2013467","journal-title":"Speech Comm"},{"key":"9901_CR32","doi-asserted-by":"crossref","unstructured":"Moungsri D, Koriyama T, Kobayashi T (2017) Duration prediction using multiple Gaussian process experts for GPR-based speech synthesis. In: Proceeding international conference on acoustics, speech and signal processing, New Orleans, USA, pp 5495\u20135499","DOI":"10.1109\/ICASSP.2017.7953207"},{"key":"9901_CR33","first-page":"1","volume":"44","author":"D Newman","year":"1984","unstructured":"Newman D (1984) The phonetics of Arabic. J Am Orient Soc 44:1\u20136","journal-title":"J Am Orient Soc"},{"key":"9901_CR34","doi-asserted-by":"crossref","unstructured":"Ogbureke U, Cabral J, Berndsen J (2012) Explicit duration modelling in HMM-based speech synthesis using a hybrid hidden Markov model-multilayer perceptron. In: Proceeding SAPA-SCALE conference, workshops on statistical and perceptual audition speech communication with adaptive learning portland, OR, USA","DOI":"10.1109\/ISSPA.2012.6310643"},{"key":"9901_CR35","unstructured":"Pan S, Tao J, Wang Y (2011) A state duration generation algorithm considering global variance for HMM-based speech synthesis. In: Proceeding annual summit and conference asia pacific signal and information processing association, Xi\u2019an, China"},{"issue":"2","key":"9901_CR36","doi-asserted-by":"publisher","first-page":"282","DOI":"10.1016\/j.csl.2006.06.003","volume":"21","author":"KS Rao","year":"2007","unstructured":"Rao KS, Yegnanarayana B (2007) Modeling durations of syllables using neural networks. Comput Speech Lang 21(2):282\u2013295","journal-title":"Comput Speech Lang"},{"key":"9901_CR37","doi-asserted-by":"crossref","unstructured":"Riedi M (1997) Modeling segmental duration with multivariate adaptive regression splines. In: Proceeding european conference on speech communication and technology, Rhodes, Greece, pp 2627\u20132630","DOI":"10.21437\/Eurospeech.1997-663"},{"key":"9901_CR38","unstructured":"Riley MD (1990) Tree-based modelling for speech synthesis. In: Proceeding ESCA workshop on speech synthesis, Autrans, France, pp 229\u2013232"},{"issue":"4","key":"9901_CR39","doi-asserted-by":"publisher","first-page":"411","DOI":"10.1016\/j.wocn.2005.02.001","volume":"33","author":"KM Rosen","year":"2005","unstructured":"Rosen KM (2005) Analysis of speech segment duration with the lognormal distribution: a basis for unification and comparison. J Phon 33(4):411\u2013426","journal-title":"J Phon"},{"issue":"2","key":"9901_CR40","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1121\/1.386780","volume":"70","author":"P Rubin","year":"1981","unstructured":"Rubin P, Baer T, Mermelstein P (1981) An articulatory synthesizer for perceptual research. J Acoust Soc Am 70(2):321\u2013328","journal-title":"J Acoust Soc Am"},{"key":"9901_CR41","doi-asserted-by":"crossref","unstructured":"Shen J, Pang R, Weiss RJ, Schuster M, Jaitly N, Yang Z, Chen Z, Zhang Y, Wang Y, Skerrv-Ryan Rj, et al. (2018) Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In: Proceeding international conference on acoustics, speech and signal processing, calgary, Alberta, Canada, pp 4779\u20134783","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"9901_CR42","doi-asserted-by":"crossref","unstructured":"Shinoda K, Watanabe T (1997) Acoustic modeling based on the MDL principle for speech recognition. In: Proceeding european conference on speech communication and technology, Rhodes, Greece, pp 99\u2013102","DOI":"10.21437\/Eurospeech.1997-52"},{"key":"9901_CR43","unstructured":"Sil\u00e9n H, Helander E, Nurminen J, Gabbouj M (2010) Analysis of duration prediction accuracy in HMM-based speech synthesis. In: Proceeding international conference on speech prosody, chicago, IL, USA, pp 1\u20134"},{"issue":"3","key":"9901_CR44","doi-asserted-by":"publisher","first-page":"1464","DOI":"10.1109\/23.589532","volume":"44","author":"J Sola","year":"1997","unstructured":"Sola J, Sevilla J (1997) Importance of input data normalization for the application of neural networks to complex industrial problems. IEEE Trans Nucl Sci 44(3):1464\u20131468","journal-title":"IEEE Trans Nucl Sci"},{"key":"9901_CR45","unstructured":"Thorpe LA, Shelton BR (1993) Subjective test methodology: MOS vs. DMOS in evaluation of speech coding algorithms. In: Proceeding IEEE workshop on speech coding for telecommunications, pp 73\u201374"},{"issue":"2","key":"9901_CR46","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1006\/csla.1994.1005","volume":"8","author":"JP Van Santen","year":"1994","unstructured":"Van Santen JP (1994) Assignment of segmentalduration in text-to-speech synthesis. Comput Speech Lang 8(2):95\u2013128","journal-title":"Comput Speech Lang"},{"key":"9901_CR47","doi-asserted-by":"crossref","unstructured":"Wang Y, Skerry-Ryan RJ, Stanton D, Wu Y, Weiss RJ, Jaitly N, Yang Z, Xiao Y, Chen Z, Bengio S, et al. (2017) Tacotron: towards end-to-end speech synthesis. In: Proceeding annual conference of the international speech communication association, Stockholm, Sweden, pp 4006\u20134010","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"9901_CR48","unstructured":"Wavenet (2020) https:\/\/deepmind.com\/blog\/article\/wavenet-generative-model-raw-audio. Accessed Aug 2020"},{"key":"9901_CR49","doi-asserted-by":"crossref","unstructured":"Wu Z, Watts O, King S (2016) MERLIN: an open source neural network speech synthesis system. In: Proceeding ISCA workshop on speech synthesis, Sunnyvale, USA, pp 202\u2013207","DOI":"10.21437\/SSW.2016-33"},{"issue":"4","key":"9901_CR50","first-page":"75","volume":"20","author":"W Yijian","year":"2006","unstructured":"Yijian W, Renhua W (2006) HMM-Based trainable speech synthesis for chinese. J Chinese Inform Process 20(4):75\u201381","journal-title":"J Chinese Inform Process"},{"key":"9901_CR51","doi-asserted-by":"crossref","unstructured":"Yoshimura T, Tokuda K, Masuko T, Kobayashi T, Kitamura T (1998) Duration modeling for HMM-based speech synthesis. In: Proceeding international conference on spoken language processing, Sydney, Australia, pp 29\u201332","DOI":"10.21437\/ICSLP.1998-6"},{"key":"9901_CR52","doi-asserted-by":"crossref","unstructured":"Yoshimura T, Tokuda K, Masuko T, Kobayashi T, Kitamura T (1999) Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis. In: Proceeding European conference on speech communication and technology, Budapest, Hungary, pp 2347\u20132350","DOI":"10.21437\/Eurospeech.1999-513"},{"key":"9901_CR53","doi-asserted-by":"crossref","unstructured":"Yu K, Mairesse F, Young S (2010) Word-level emphasis modelling in HMM-based speech synthesis. In: Proceeding international conference on acoustics, speech and signal processing, dallas, TX, USA, pp 4238\u20134241","DOI":"10.1109\/ICASSP.2010.5495690"},{"key":"9901_CR54","unstructured":"Zaki A, Rajouani A, Najim M (2002) Un mod\u00e8le pr\u00e9dictif de la dur\u00e9e segmentale pour la synth\u00e8se de la parole arabe \u00e0 partir du texte. In: Proceeding Journ\u00e9es d\u2019etudes sur la parole, Nancy, France, pp 89\u201392"},{"key":"9901_CR55","unstructured":"Zangar I, Colotte V, Mnasri Z, Jouvet D, Houidhek A (2018) Duration modelling using DNN for Arabic speech synthesis. In: Proceeding international conference on speech prosody, Poznan, Poland, pp 597\u2013601"},{"key":"9901_CR56","doi-asserted-by":"crossref","unstructured":"Zen H, Sak H (2015) Unidirectional long short-term memory recurrent neural network with recurrent output layer for low-latency speech synthesis. In: Proceeding international conference on acoustics, speech and signal processing, Brisbane, Australia, pp 4470\u20134474","DOI":"10.1109\/ICASSP.2015.7178816"},{"key":"9901_CR57","doi-asserted-by":"crossref","unstructured":"Zen H, Senior A, Schuster M (2013) Statistical parametric speech synthesis using deep neural networks. In: Proceeding IEEE international conference on acoustics, speech and signal processing, Vancouver, Canada, pp 7962\u20137966","DOI":"10.1109\/ICASSP.2013.6639215"},{"issue":"11","key":"9901_CR58","doi-asserted-by":"publisher","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","volume":"51","author":"H Zen","year":"2009","unstructured":"Zen H, Tokuda K, Black AW (2009) Statistical parametric speech synthesis. Speech Comm 51(11):1039\u20131064","journal-title":"Speech Comm"},{"key":"9901_CR59","doi-asserted-by":"crossref","unstructured":"Zen H, Tokuda K, Masuko T, Kobayashi T, Kitamura T (2004) Hidden semi-Markov model based speech synthesis. In: Proceeding international conference on spoken language processing, Jeju Island, Korea, pp 1393\u20131396","DOI":"10.21437\/Interspeech.2004-460"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09901-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-020-09901-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09901-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,11]],"date-time":"2023-10-11T06:57:43Z","timestamp":1697007463000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-020-09901-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,11,2]]},"references-count":59,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2021,3]]}},"alternative-id":["9901"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-09901-7","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2020,11,2]]},"assertion":[{"value":"26 June 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 September 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 November 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}