{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T14:28:27Z","timestamp":1770733707604,"version":"3.49.0"},"reference-count":59,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62322120"],"award-info":[{"award-number":["62322120"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2010"],"award-info":[{"award-number":["U21B2010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["2025T180461"],"award-info":[{"award-number":["2025T180461"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["2025M771685"],"award-info":[{"award-number":["2025M771685"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1109\/tpami.2025.3626793","type":"journal-article","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T17:11:59Z","timestamp":1761930719000},"page":"2596-2609","source":"Crossref","is-referenced-by-count":0,"title":["SpeechPalette: A Comprehensive Speech Editing Method for Text-Based Speech Editing, One-Shot TTS and Attributes Editing"],"prefix":"10.1109","volume":"48","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1490-6973","authenticated-orcid":false,"given":"Tao","family":"Wang","sequence":"first","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2422-4618","authenticated-orcid":false,"given":"Jiangyan","family":"Yi","sequence":"additional","affiliation":[{"name":"Department of Automation, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9598-1881","authenticated-orcid":false,"given":"Ruibo","family":"Fu","sequence":"additional","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunyu","family":"Qiang","sequence":"additional","affiliation":[{"name":"Department of New Media and Communication, Tianjin University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3495-522X","authenticated-orcid":false,"given":"Dading","family":"Chong","sequence":"additional","affiliation":[{"name":"Department of Electronic and Computer Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Electronics and Information Engineering, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongyang","family":"Dai","sequence":"additional","affiliation":[{"name":"Department of Electronics and Information Engineering, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengqi","family":"Wen","sequence":"additional","affiliation":[{"name":"Department of Automation, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9344-6428","authenticated-orcid":false,"given":"Jianhua","family":"Tao","sequence":"additional","affiliation":[{"name":"Department of Automation, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3305243"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"ref3","first-page":"16331","article-title":"EditGAN: High-precision semantic image editing","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Ling","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3308102"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3181070"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3142527"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3168569"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073702"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414633"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2024.104076"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688051"},{"key":"ref12","first-page":"1399","article-title":"A$^{3}$3 t: Alignment-aware acoustic and text pretraining for speech synthesis and editing","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bai","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746765"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3190717"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1993.319366"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.3390\/app6020057"},{"key":"ref17","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"Ren","year":"2020"},{"key":"ref18","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim","year":"2021"},{"key":"ref19","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Skerry-Ryan","year":"2018"},{"key":"ref20","article-title":"A survey on neural speech synthesis","author":"Tan","year":"2021"},{"key":"ref21","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2018"},{"key":"ref22","first-page":"4485","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Jia","year":"2018"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1737"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref25","first-page":"8067","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Kim","year":"2020"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414422"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414427"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095776"},{"key":"ref29","article-title":"NaturalSpeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","author":"Shen","year":"2023"},{"key":"ref30","article-title":"NaturalSpeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","author":"Ju","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2015.2420092"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.3311\/ppee.17024"},{"key":"ref34","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Casanova","year":"2022"},{"key":"ref35","article-title":"Spark-TTS: An efficient LLM-based text-to-speech model with single-stream decoupled speech tokens","author":"Wang","year":"2025"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1745"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1113\/jphysiol.1937.sp003556"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(99)00051-5"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1121\/1.1918682"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1002\/j.1538-7305.1966.tb01706.x"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.2307\/3680093"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.741"},{"key":"ref43","first-page":"56422","article-title":"UniAudio: Towards universal audio generation with large language models","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Yang","year":"2024"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref45","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","volume":"139","author":"Kim","year":"2021"},{"key":"ref46","article-title":"ResGrad: Residual denoising diffusion probabilistic models for text to speech","author":"Chen","year":"2022"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref48","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Popov","year":"2021"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref50","volume-title":"Theory and Application of Digital Signal Processing","author":"Rabiner","year":"1975"},{"key":"ref51","first-page":"17022","article-title":"Hifi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Kong","year":"2020"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682156"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-288"},{"key":"ref55","article-title":"Step-audio: Unified understanding and generation in intelligent speech interaction","author":"Huang","year":"2025"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-299"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2960721"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.2970241"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2013.2294023"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11372200\/11222950.pdf?arnumber=11222950","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T21:05:26Z","timestamp":1770671126000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11222950\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":59,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3626793","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3]]}}}