{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T17:19:17Z","timestamp":1765041557948,"version":"3.28.0"},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,15]]},"DOI":"10.1109\/icme57554.2024.10687845","type":"proceedings-article","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T17:24:16Z","timestamp":1727717056000},"page":"1-6","source":"Crossref","is-referenced-by-count":2,"title":["AdaStyleSpeech: A Fast Stylized Speech Synthesis Model Based on Adaptive Instance Normalization"],"prefix":"10.1109","author":[{"given":"Yuming","family":"Yang","sequence":"first","affiliation":[{"name":"Chongqing University,College of Computer Science,Chongqing,China"}]},{"given":"Dongsheng","family":"Zou","sequence":"additional","affiliation":[{"name":"Chongqing University,College of Computer Science,Chongqing,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"journal-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","year":"2020","author":"Ren","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref4","first-page":"8599","article-title":"Grad-tts: A diffusion probabilistic model for text-to-speech","volume-title":"International Conference on Machine Learning","author":"Popov"},{"journal-title":"Wavegrad: Estimating gradients for waveform generation","year":"2020","author":"Chen","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref7","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"International conference on machine learning","author":"Wang"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01199"},{"journal-title":"Adaspeech: Adaptive text to speech for custom voice","year":"2021","author":"Chen","key":"ref9"},{"journal-title":"Wavenet: A generative model for raw audio","year":"2016","author":"van den Oord","key":"ref10"},{"key":"ref11","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","volume-title":"International conference on machine learning","author":"Ar\u0131k"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"journal-title":"Clarinet: Parallel wave generation in end-to-end text-to-speech","year":"2018","author":"Ping","key":"ref13"},{"journal-title":"End-to-end adversarial text-to-speech","year":"2020","author":"Donahue","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1129"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054591"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-947"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2280"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref20","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"international conference on machine learning","author":"Skerry-Ryan"},{"journal-title":"Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models","year":"2023","author":"Li","key":"ref21"},{"key":"ref22","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume":"31","author":"Jia","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-755"}],"event":{"name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2024,7,15]]},"location":"Niagara Falls, ON, Canada","end":{"date-parts":[[2024,7,19]]}},"container-title":["2024 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10685847\/10687354\/10687845.pdf?arnumber=10687845","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T06:35:38Z","timestamp":1727764538000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10687845\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,15]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/icme57554.2024.10687845","relation":{},"subject":[],"published":{"date-parts":[[2024,7,15]]}}}