{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T04:48:11Z","timestamp":1746679691195,"version":"3.28.0"},"reference-count":36,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,12,11]],"date-time":"2022-12-11T00:00:00Z","timestamp":1670716800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,12,11]],"date-time":"2022-12-11T00:00:00Z","timestamp":1670716800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,12,11]]},"DOI":"10.1109\/iscslp57327.2022.10038135","type":"proceedings-article","created":{"date-parts":[[2023,2,8]],"date-time":"2023-02-08T13:53:24Z","timestamp":1675864404000},"page":"61-65","source":"Crossref","is-referenced-by-count":3,"title":["Style-Label-Free: Cross-Speaker Style Transfer by Quantized VAE and Speaker-wise Normalization in Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Chunyu","family":"Qiang","sequence":"first","affiliation":[{"name":"Kwai,Beijing,P.R. China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Yang","sequence":"additional","affiliation":[{"name":"Kwai,Beijing,P.R. China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Che","sequence":"additional","affiliation":[{"name":"Kwai,Beijing,P.R. China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaorui","family":"Wang","sequence":"additional","affiliation":[{"name":"Kwai,Beijing,P.R. China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Kwai,Beijing,P.R. China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","article-title":"Cross-speaker emotion transfer based on speaker condition layer normalization and semi-supervised training in text-to-speech","author":"wu","year":"2021","journal-title":"arXiv preprint arXiv 2110 07058"},{"key":"ref35","article-title":"Neural tts stylization with adversarial and collaborative games","author":"ma","year":"2018","journal-title":"International Conference on Learning Representations"},{"key":"ref12","article-title":"Semi-supervised generative modeling for controllable speech synthesis","author":"habib","year":"2019","journal-title":"arXiv preprint arXiv 1910 01500"},{"key":"ref34","article-title":"Multi-speaker multi-style text-to-speech synthesis with single-speaker single-style training data scenarios","author":"xie","year":"2021","journal-title":"arXiv preprint arXiv 2112 12310"},{"key":"ref15","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","author":"wang","year":"2018","journal-title":"International Conference on Machine Learning"},{"key":"ref14","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","author":"skerry-ryan","year":"2018","journal-title":"International Conference on Machine Learning"},{"key":"ref36","article-title":"Visualizing data using t-sne","volume":"9","author":"van der maaten","year":"2008","journal-title":"Journal of Machine Learning Research"},{"key":"ref31","first-page":"3683","article-title":"Fitting new speakers based on a short untranscribed sample","author":"nachmani","year":"2018","journal-title":"International Conference on Machine Learning"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3021758"},{"key":"ref11","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"hsu","year":"2018","journal-title":"arXiv preprint arXiv 1810 07225"},{"key":"ref33","article-title":"Neural discrete representation learning","volume":"30","author":"van den oord","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-979"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"article-title":"Char2wav: End-to-end speech synthesis","year":"2017","author":"sotelo","key":"ref1"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-838"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1430"},{"key":"ref18","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2013","journal-title":"arXiv preprint arXiv 1312 6114"},{"key":"ref24","first-page":"2096","article-title":"Domainadversarial training of neural networks","volume":"17","author":"ganin","year":"2016","journal-title":"The Journal of Machine Learning Research"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1854"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413391"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413934"},{"key":"ref20","first-page":"3331","article-title":"Chive: Varying prosody in speech synthesis with a linguistically driven dynamic hierarchical conditional variational network","author":"kenter","year":"2019","journal-title":"International Conference on Machine Learning"},{"key":"ref22","article-title":"Multi-reference tacotron by intercross training for style disentangling, transfer and control in speech synthesis","author":"bian","year":"2019","journal-title":"arXiv preprint arXiv 1904 13138"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053436"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1407"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2021.03.005"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9616249"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362069"},{"key":"ref7","article-title":"Multi-reference neural tts stylization with adversarial cycle consistency","author":"whitehill","year":"2019","journal-title":"arXiv preprint arXiv 1910 10106"},{"key":"ref9","article-title":"Unitts: Residual learning of unified embedding space for speech style control","author":"kang","year":"2021","journal-title":"arXiv preprint arXiv 2106 01111"},{"key":"ref4","first-page":"8067","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume":"33","author":"kim","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref3","first-page":"7586","article-title":"Non-autoregressive neural text-to-speech","author":"peng","year":"2020","journal-title":"International Conference on Machine Learning"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414718"},{"key":"ref5","article-title":"Vara-tts: Non-autoregressive text-to-speech synthesis based on very deep vae with residual attention","author":"liu","year":"2021","journal-title":"arXiv preprint arXiv 2102 09032"}],"event":{"name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","start":{"date-parts":[[2022,12,11]]},"location":"Singapore, Singapore","end":{"date-parts":[[2022,12,14]]}},"container-title":["2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10037756\/10037573\/10038135.pdf?arnumber=10038135","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,3,6]],"date-time":"2023-03-06T13:37:39Z","timestamp":1678109859000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10038135\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12,11]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/iscslp57327.2022.10038135","relation":{},"subject":[],"published":{"date-parts":[[2022,12,11]]}}}