{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,30]],"date-time":"2025-11-30T08:50:09Z","timestamp":1764492609443},"reference-count":22,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9688277","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"710-716","source":"Crossref","is-referenced-by-count":8,"title":["AC-VC: Non-Parallel Low Latency Phonetic Posteriorgrams Based Voice Conversion"],"prefix":"10.1109","author":[{"given":"Damien","family":"Ronssin","sequence":"first","affiliation":[{"name":"Logitech Europe S.A.,Lausanne,Switzerland,1015"}]},{"given":"Milos","family":"Cernak","sequence":"additional","affiliation":[{"name":"Logitech Europe S.A.,Lausanne,Switzerland,1015"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682804"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref12","first-page":"2410","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"2018","journal-title":"International Conference on Machine Learning"},{"key":"ref13","article-title":"Wavenet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref14","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition &amp; Understanding"},{"key":"ref15","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"0","journal-title":"Proc Interspeech 2017"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2017.2723507"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"journal-title":"Librispeech alignments","year":"2019","author":"lugosch","key":"ref18"},{"journal-title":"CSTR VCTK corpus English multi-speaker corpus for cstr voice cloning toolkit","year":"2016","author":"veaux","key":"ref19"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/VCC_BC.2020-19"},{"key":"ref3","first-page":"1","article-title":"Phonetic posteriorgrams for many-to-one voice conversion without parallel data training","author":"sun","year":"2016","journal-title":"2016 IEEE International Conference on Multimedia and Expo (ICME)"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/VCC_BC.2020-21"},{"key":"ref5","first-page":"5210","article-title":"Autovc: Zero-shot voice style transfer with only autoencoder loss","author":"qian","year":"2019","journal-title":"International Conference on Machine Learning"},{"key":"ref8","first-page":"1021","article-title":"Real-time, full-band, online dnn-based voice conversion system using a single cpu","author":"saeki","year":"0","journal-title":"Proc Interspeech 2020"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-17"},{"journal-title":"Voice transformer network Sequence-to-sequence voice conversion using transformer with text-to-speech pretraining","year":"2019","author":"huang","key":"ref2"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1066"},{"key":"ref9","article-title":"Voice conversion challenge 2020: Intra-lingual semi-parallel and cross-lingual voice conversion","author":"zhao","year":"2020","journal-title":"ArXiv Preprint"},{"journal-title":"Parallel wavegan A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram","year":"2020","author":"yamamoto","key":"ref20"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2665"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2021,12,13]]},"location":"Cartagena, Colombia","end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09688277.pdf?arnumber=9688277","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:42:13Z","timestamp":1652733733000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9688277\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":22,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9688277","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}