{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T03:11:06Z","timestamp":1772593866023,"version":"3.50.1"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["KPF20QEP26"],"award-info":[{"award-number":["KPF20QEP26"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9688051","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"626-633","source":"Crossref","is-referenced-by-count":20,"title":["EditSpeech: A Text Based Speech Editing System Using Partial Inference and Bidirectional Fusion"],"prefix":"10.1109","author":[{"given":"Daxin","family":"Tan","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong,Department of Electronic Engineering,Hong Kong"}]},{"given":"Liqun","family":"Deng","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab,Shenzhen,China"}]},{"given":"Yu Ting","family":"Yeung","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab,Shenzhen,China"}]},{"given":"Xin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab,Shenzhen,China"}]},{"given":"Xiao","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab,Shenzhen,China"}]},{"given":"Tan","family":"Lee","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Department of Electronic Engineering,Hong Kong"}]}],"member":"263","reference":[{"key":"ref10","first-page":"214","article-title":"Deep voice 3: 2000-speaker neural text-to-speech","author":"wei","year":"0","journal-title":"Proc ICLR"},{"key":"ref11","article-title":"Durian: Duration informed attention network for multimodal synthesis","author":"yu","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref12","article-title":"Fastspeech 2: Fast and high-quality end-to-end text-to-speech","author":"ren","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref14","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume":"33","author":"kim","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413658"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2935807"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00256"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682804"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref4","first-page":"1","article-title":"Voco: Text-based insertion and replacement in audio narration","volume":"36","author":"zeyu","year":"2017","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"ref28","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"ArXiv Preprint"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1016\/j.ijhcs.2018.03.006","article-title":"A contextual study of semantic speech editing in radio production","volume":"115","author":"chris","year":"2018","journal-title":"International Journal of Human-Computer Studies"},{"key":"ref27","author":"park","year":"2019","journal-title":"G2pe"},{"key":"ref6","year":"2020","journal-title":"The Descriptor"},{"key":"ref5","article-title":"Context-aware prosody correction for text-based speech editing","author":"max","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74048-3_4"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2143"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"ref2","first-page":"113","article-title":"Content-based tools for editing audio stories","author":"steve","year":"0","journal-title":"Proc 16th Annual ACM Symposium on User Interface Software and Technology"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/985692.985759"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref22","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","author":"kong","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref21","article-title":"Melgan: Generative adversarial networks for conditional waveform synthesis","author":"kumar","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref24","author":"yamagishi","year":"2019","journal-title":"CSTR VCTK Corpus English Multi-Speaker Corpus for CSTR Voice Cloning Toolkit (version 0 92)"},{"key":"ref23","author":"ito","year":"2017","journal-title":"The LJ speech dataset"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref25","article-title":"Aishell-3: A multi-speaker mandarin tts corpus and the baselines","author":"shi","year":"2020","journal-title":"ArXiv Preprint"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Cartagena, Colombia","start":{"date-parts":[[2021,12,13]]},"end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09688051.pdf?arnumber=9688051","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:41:53Z","timestamp":1652733713000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9688051\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9688051","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}