{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T04:05:13Z","timestamp":1743134713496,"version":"3.40.3"},"reference-count":19,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,1,11]],"date-time":"2025-01-11T00:00:00Z","timestamp":1736553600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,11]],"date-time":"2025-01-11T00:00:00Z","timestamp":1736553600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,1,11]]},"DOI":"10.1109\/icce63647.2025.10930170","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T02:16:58Z","timestamp":1743041818000},"page":"1-3","source":"Crossref","is-referenced-by-count":0,"title":["Briefing of Text-to-Speech Trends Toward Zero-Shot Multi-Speaker Synthesis"],"prefix":"10.1109","author":[{"given":"Dahyun","family":"Song","sequence":"first","affiliation":[{"name":"Chung-Ang University,Dept. Artificial Intelligence,Seoul,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jaesung","family":"Lee","sequence":"additional","affiliation":[{"name":"Chung-Ang University,Dept. Artificial Intelligence,Seoul,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414403"},{"key":"ref2","first-page":"7748","article-title":"Meta-stylespeech: Multi-speaker adaptive text-to-speech generation","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Min","year":"2021"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-1452"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref6","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Proceedings of the 32th Annual Conference on Neural Information Processing Systems","volume":"32","author":"Ren","year":"2019"},{"key":"ref7","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","author":"Arik","year":"2017","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"Hsu","year":"2018","journal-title":"arXiv preprint"},{"key":"ref9","article-title":"Deep voice 3: Scaling text-to-speech with convolutional sequence learning","author":"Ping","year":"2017","journal-title":"arXiv preprint"},{"key":"ref10","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume-title":"Proceedings of the 32nd Annual Conference on Neural Information Processing Systems","volume":"31","author":"Jia","year":"2018"},{"key":"ref11","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"Proceedings of the 35th international conference on machine learning","author":"Skerry-Ryan","year":"2018"},{"key":"ref12","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Kim","year":"2021"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3139"},{"key":"ref14","article-title":"Ganspeech: Adversarial training for high-fidelity multi-speaker speech synthesis","author":"Yang","year":"2021","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3277786"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746875"},{"key":"ref17","first-page":"10970","article-title":"Generspeech: Towards style transfer for generalizable out-of-domain text-to-speech","volume-title":"Proceedings of the 36th Annual Conference on Neural Information Processing Systems","volume":"35","author":"Huang","year":"2022"},{"key":"ref18","first-page":"2709","article-title":"Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone","volume-title":"Proceedings of the 39th International Conference on Machine Learning","author":"Casanova","year":"2022"},{"key":"ref19","article-title":"Adaspeech 4: Adaptive text to speech in zero-shot scenarios","author":"Wu","year":"2022","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE International Conference on Consumer Electronics (ICCE)","start":{"date-parts":[[2025,1,11]]},"location":"Las Vegas, NV, USA","end":{"date-parts":[[2025,1,14]]}},"container-title":["2025 IEEE International Conference on Consumer Electronics (ICCE)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10929765\/10929768\/10930170.pdf?arnumber=10930170","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T10:47:10Z","timestamp":1743072430000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10930170\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,11]]},"references-count":19,"URL":"https:\/\/doi.org\/10.1109\/icce63647.2025.10930170","relation":{},"subject":[],"published":{"date-parts":[[2025,1,11]]}}}