{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T04:17:16Z","timestamp":1752985036171},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,25]],"date-time":"2023-10-25T00:00:00Z","timestamp":1698192000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,25]],"date-time":"2023-10-25T00:00:00Z","timestamp":1698192000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,25]]},"DOI":"10.1109\/sped59241.2023.10314948","type":"proceedings-article","created":{"date-parts":[[2023,11,15]],"date-time":"2023-11-15T18:50:55Z","timestamp":1700074255000},"page":"152-157","source":"Crossref","is-referenced-by-count":1,"title":["Advancing Limited Data Text-to-Speech Synthesis: Non-Autoregressive Transformer for High-Quality Parallel Synthesis"],"prefix":"10.1109","author":[{"given":"Mohammed Salah","family":"Al-Radhi","sequence":"first","affiliation":[{"name":"Budapest University of Technology and Economics,Department of Telecommunications and Media Informatics,Budapest,Hungary"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Omnia","family":"Ibrahim","sequence":"additional","affiliation":[{"name":"Saarland University,Department of Language Science and Technology,Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ali Raheem","family":"Mandeel","sequence":"additional","affiliation":[{"name":"Budapest University of Technology and Economics,Department of Telecommunications and Media Informatics,Budapest,Hungary"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tam\u00e1s","family":"G\u00e1bor Csap\u00f3","sequence":"additional","affiliation":[{"name":"Budapest University of Technology and Economics,Department of Telecommunications and Media Informatics,Budapest,Hungary"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"G\u00e9za","family":"N\u00e9meth","sequence":"additional","affiliation":[{"name":"Budapest University of Technology and Economics,Department of Telecommunications and Media Informatics,Budapest,Hungary"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746686"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3139"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2935807"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Blizzard.2023-5"},{"key":"ref7","article-title":"Nana-HDR: A Non-attentive Non-autoregressive Hybrid Model for TTS","author":"Lin","year":"2021","journal-title":"Blizzard Challenge workshop"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/icassp40776.2020.9054484"},{"article-title":"FastSpeech2: Fast and High-Quality End-to-End Text to Speech","volume-title":"The Conference on Learning Representations (ICLR)","author":"Ren","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-30"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682368"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58309-5_22"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-022-09961-0"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-016-9342-8"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/SSD.2019.8893275"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCSPA49915.2021.9385731"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICIHT.2017.7899133"},{"key":"ref19","first-page":"577","article-title":"Attention-Based Models for Speech Recognition","volume-title":"Proceedings of the Neural Information Processing Systems (NIPS)","author":"Chorowski"},{"article-title":"WaveNet: A generative model for raw audio","volume-title":"Speech Synthesis Workshop","author":"Oord","key":"ref20"},{"key":"ref21","first-page":"734","article-title":"Phonetic inventory for an Arabic speech corpus","volume-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC\u201916)","author":"Halabi"},{"key":"ref22","first-page":"3165","article-title":"Fastspeech: Fast, robust and controllable text to speech","author":"Ren","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3167258"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref25","first-page":"17022","article-title":"HiFi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"International Conference on Neural Information Processing Systems (NeurIPS)","author":"Kong"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1121\/1.3097493"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2003"},{"article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"34th Conference on Neural Information Processing Systems (NeurIPS 2020)","author":"Baevski","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-28"},{"year":"2001","key":"ref30","article-title":"Method for the subjective assessment of intermediate audio quality"}],"event":{"name":"2023 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","start":{"date-parts":[[2023,10,25]]},"location":"Bucharest, Romania","end":{"date-parts":[[2023,10,27]]}},"container-title":["2023 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10314856\/10314866\/10314948.pdf?arnumber=10314948","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,14]],"date-time":"2024-03-14T01:33:51Z","timestamp":1710380031000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10314948\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,25]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/sped59241.2023.10314948","relation":{},"subject":[],"published":{"date-parts":[[2023,10,25]]}}}