{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:33:22Z","timestamp":1763458402805,"version":"3.37.3"},"reference-count":77,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206171","U23B2018"],"award-info":[{"award-number":["62206171","U23B2018"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"name":"International Cooperation Project of PCL"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3485466","type":"journal-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T17:50:29Z","timestamp":1729705829000},"page":"4810-4821","source":"Crossref","is-referenced-by-count":1,"title":["E$^{3}$TTS: End-to-End Text-Based Speech Editing TTS System and Its Applications"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3314-3079","authenticated-orcid":false,"given":"Zheng","family":"Liang","sequence":"first","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Ziyang","family":"Ma","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5329-0847","authenticated-orcid":false,"given":"Chenpeng","family":"Du","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7423-617X","authenticated-orcid":false,"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-923"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073702"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414633"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688051"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-189"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746765"},{"key":"ref7","first-page":"1399","article-title":"A $^{3}$ T: Alignment-aware acoustic and text pretraining for speech synthesis and editing","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bai","year":"2022"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-245"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-194"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"ref12","first-page":"735","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren","year":"2021"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004035"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683223"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683338"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462180"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682850"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747537"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11249"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2416"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1209"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683573"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639034"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747726"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746126"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref32","first-page":"811","article-title":"DeepVoice 3: 2000-speaker neural text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ping","year":"2018"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref34","first-page":"3171","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ren","year":"2019"},{"key":"ref35","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Popov","year":"2021"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413851"},{"key":"ref37","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim","year":"2021"},{"key":"ref38","first-page":"602","article-title":"End-to-end adversarial text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Donahue","year":"2020"},{"key":"ref39","first-page":"1025","article-title":"ClariNet: Parallel wave generation in end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ping","year":"2019"},{"key":"ref40","first-page":"14910","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kumar","year":"2019"},{"key":"ref41","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kong","year":"2020"},{"key":"ref42","first-page":"607","article-title":"BigVGAN: A universal neural vocoder with large-scale training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lee","year":"2023"},{"key":"ref43","first-page":"12449","article-title":"Wav2ec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref45","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Baevski","year":"2022"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-822"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053176"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-630"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096923"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-6"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095100"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29747"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.741"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688101"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383620"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-719"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2402"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414778"},{"key":"ref61","article-title":"Sequence transduction with recurrent neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Graves","year":"2012"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023313"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2177"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3214"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-52"},{"key":"ref66","first-page":"271","article-title":"The Kaldi speech recognition toolkit","volume-title":"Proc. ASRU","author":"Povey","year":"2011"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1456"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1599"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-755"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2482"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053191"},{"key":"ref75","first-page":"71","article-title":"The ASRU 2019 mandarin-english code-switching speech recognition challenge: Open datasets, tracks, methods and results","volume-title":"Proc. 1st Workshop Speech Technol. Code-Switching Multilingual Communities","author":"Shi","year":"2020"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-923"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10731477.pdf?arnumber=10731477","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T01:28:28Z","timestamp":1732670908000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10731477\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":77,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3485466","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"type":"print","value":"2329-9290"},{"type":"electronic","value":"2329-9304"}],"subject":[],"published":{"date-parts":[[2024]]}}}